library(ggplot2)
library(SingleCellExperiment)
library(tidyverse)
library(igraph)
library(scran)
devtools::load_all("~/milo/miloR/")
Using dyntoy, I simulate a scRNA-seq data with cells forming a linear trajectory (no branches) with 5000 cells and 10 main states (milestones).
set.seed(42)
dataset <- generate_dataset(
model = model_linear(num_milestones = 10),
num_cells = 5000,
num_features = 5000
)
|=============================================================================== | 95% ~0 s remaining
|================================================================================= | 97% ~0 s remaining
|=================================================================================== | 99% ~0 s remaining
gex <- as.matrix(dataset$expression)
branches <- dataset$prior_information$groups_id
## Dimensionality reduction
pca <- prcomp_irlba(gex, n=50, scale.=TRUE, center=TRUE)
X_pca = pca$x[, c(1:30)]
I assign cells to simulated biological conditions and replicates, that we will use for differential abundance testing. For each of the \(M\) clusters, I assign different proportions of cells to condition A or condition B, while simulating proportionate mixing between replicates.
Construct a Milo object
sim_milo
class: Milo
dim: 5000 5000
metadata(0):
assays(2): counts logcounts
rownames(5000): G1 G2 ... G4999 G5000
rowData names(0):
colnames(5000): C1 C2 ... C4999 C5000
colData names(4): group_id condition replicate sample
reducedDimNames(1): PCA
altExpNames(0):
neighbourhoods dimensions(1): 0
neighbourhoodCounts dimensions(2): 1 1
neighbourDistances dimensions(2): 1 1
graph names(0):
neighbourhoodIndex names(1): 0
sim_milo <- buildGraph(sim_milo, k = 20, d = 30)
Constructing kNN graph with k:20
Retrieving distances from 20 nearest neighbours
Using sampling of 10% random points
Generating simulations and then using the thymus ageing data to test differential abundance testing on a graph/tree.
library(ggplot2)
library(edgeR)
library(igraph)
library(SingleCellExperiment)
library(scran)
library(scater)
library(irlba)
library(ggthemes)
library(ggsci)
library(cydar)
library(mvtnorm)
library(umap)
library(reshape2)
I’ll start by simulating 3 clouds of points in \(\mathbb{R}^{n}\), each consisting of points from 2 pools, A & B. Each cloud of points will be composed of:
I will then perform a PCA on the matrix of these points and construct a KNN-graph, mimicing the standard workflow of many scRNA-seq analyses.
set.seed(42)
r.n <- 1000
n.dim <- 50
block1.cells <- 1200
# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues?
block1.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
block1.eigens <- block1.eigens[order(block1.eigens)]
block1.p <- qr.Q(qr(matrix(rnorm(block1.cells^2, mean=4, sd=0.01), block1.cells)))
block1.sigma <- crossprod(block1.p, block1.p*block1.eigens)
block1.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block1.cells, mean=2, sd=0.01), sigma=block1.sigma))
block2.cells <- 1200
# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues?
block2.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
block2.eigens <- block2.eigens[order(block2.eigens)]
block2.p <- qr.Q(qr(matrix(rnorm(block2.cells^2, mean=4, sd=0.01), block2.cells)))
block2.sigma <- crossprod(block2.p, block2.p*block2.eigens)
block2.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block2.cells, mean=4, sd=0.01), sigma=block2.sigma))
block3.cells <- 1250
# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues?
block3.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
block3.eigens <- block3.eigens[order(block3.eigens)]
block3.p <- qr.Q(qr(matrix(rnorm(block3.cells^2, mean=4, sd=0.01), block3.cells)))
block3.sigma <- crossprod(block3.p, block3.p*block3.eigens)
block3.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block3.cells, mean=5, sd=0.01), sigma=block3.sigma))
sim1.gex <- do.call(cbind, list("b1"=block1.gex, "b2"=block2.gex, "b3"=block3.gex))
sim1.pca <- prcomp_irlba(t(sim1.gex), n=50, scale.=TRUE, center=TRUE)
pairs(sim1.pca$x[, c(1:5)])
I’ll use the reduced dimensions here to compute a KNN-graph and visualise it using a Fructerman-Reingold layout.
set.seed(42)
sim1.knn <- buildKNNGraph(x=sim1.pca$x[, c(1:30)], k=21, d=NA, transposed=TRUE)
sim1.fr.layout <- layout_with_fr(sim1.knn)
plot(sim1.knn, layout=sim1.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
Also a UMAP layout.
set.seed(42)
stem.ta.umap <- umap(sim1.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
plot(stem.ta.umap$layout, col=c(rep("red", block1.cells), rep("skyblue", block2.cells), rep("orange", block3.cells)),
xlab="UMAP 1", ylab="UMAP 2")
Within each of these clouds of points I will randomly label in the proportions above.
set.seed(42)
block1.cond <- rep("A", block1.cells)
block1.a <- sample(1:block1.cells, size=floor(block1.cells*0.9))
block1.b <- setdiff(1:block1.cells, block1.a)
block1.cond[block1.b] <- "B"
block2.cond <- rep("A", block2.cells)
block2.a <- sample(1:block2.cells, size=floor(block2.cells*0.05))
block2.b <- setdiff(1:block2.cells, block2.a)
block2.cond[block2.b] <- "B"
block3.cond <- rep("A", block3.cells)
block3.a <- sample(1:block3.cells, size=floor(block3.cells*0.5))
block3.b <- setdiff(1:block3.cells, block3.a)
block3.cond[block3.b] <- "B"
meta.df <- data.frame("Block"=c(rep("B1", block1.cells), rep("B2", block2.cells), rep("B3", block3.cells)),
"Condition"=c(block1.cond, block2.cond, block3.cond),
"Replicate"=c(rep("R1", floor(block1.cells*0.33)), rep("R2", floor(block1.cells*0.33)),
rep("R3", block1.cells-(2*floor(block1.cells*0.33))),
rep("R1", floor(block2.cells*0.33)), rep("R2", floor(block2.cells*0.33)),
rep("R3", block2.cells-(2*floor(block2.cells*0.33))),
rep("R1", floor(block3.cells*0.33)), rep("R2", floor(block3.cells*0.33)),
rep("R3", block3.cells-(2*floor(block3.cells*0.33)))))
meta.df <- cbind(meta.df, stem.ta.umap$layout)
colnames(meta.df) <- c("Block", "Condition", "Replicate", "UMAP1", "UMAP2")
# define a "sample" as teh combination of condition and replicate
meta.df$Sample <- paste(meta.df$Condition, meta.df$Replicate, sep="_")
meta.df$Vertex <- c(1:nrow(meta.df))
ggplot(meta.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Block, shape=Replicate)) +
theme_clean() +
scale_colour_npg() +
facet_wrap(~Condition) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
Using these simulated data we can define lots of neighbourhoods across the graph to create a counts matrix of neighbourhoods vs conditions in a similar way as cydar. I’ll start with random vertices in the graph making up 5% of all points and extract the graph neighborhoods.
# randomly select vertices in the graph
n.hood <- 0.05
random.vertices <- sample(V(sim1.knn), size=floor(n.hood*length(V(sim1.knn))))
# loop over random vertices and count the number of cells in each
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(sim1.knn, v=random.vertices[X]))
hist(unlist(lapply(vertex.list, length)), 100, main="Histogram of neighbors", xlab="Neighbourhood size")
For each neighbourhood I’ll count the number of cells in each, determined by the experimental design, i.e. replicate, condition and block.
quant_neighbourhood <- function(graph, meta, sample.column='Sample', sample.vertices=0.25, seed=42){
set.seed(seed)
# define a set of vertices and neihbourhood centers - extract the neihbourhoods of these cells
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
count.matrix <- matrix(0L, ncol=length(unique(meta[, sample.column])), nrow=length(vertex.list))
colnames(count.matrix) <- unique(meta[, sample.column])
for(x in seq_along(1:length(vertex.list))){
v.x <- vertex.list[[x]]
for(i in seq_along(1:length(unique(meta[, sample.column])))){
i.s <- unique(meta[, sample.column])[i]
i.s.vertices <- intersect(v.x, meta[meta[, sample.column] == i.s, ]$Vertex)
count.matrix[x, i] <- length(i.s.vertices)
}
}
rownames(count.matrix) <- c(1:length(vertex.list))
return(count.matrix)
}
# define a set of vertices and neihbourhood centers - extract the neihbourhoods of these cells
set.seed(42)
random.vertices <- sample(V(sim1.knn), size=floor(n.hood*length(V(sim1.knn))))
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(sim1.knn, v=random.vertices[X]))
count.matrix <- matrix(0L, ncol=length(unique(meta.df[, "Sample"])), nrow=length(vertex.list))
colnames(count.matrix) <- unique(meta.df[, "Sample"])
for(x in seq_along(1:length(vertex.list))){
v.x <- vertex.list[[x]]
for(i in seq_along(1:length(unique(meta.df[, "Sample"])))){
i.s <- unique(meta.df[, "Sample"])[i]
i.s.vertices <- intersect(v.x, meta.df[meta.df[, "Sample"] == i.s, ]$Vertex)
count.matrix[x, i] <- length(i.s.vertices)
}
}
rownames(count.matrix) <- c(1:length(vertex.list))
sim1.counts <- quant_neighbourhood(graph=sim1.knn, meta=meta.df, sample.column='Sample', sample.vertices=n.hood)
sample.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
"Replicate"=rep(c("R1", "R2", "R3"), 2))
sample.meta$Sample <- paste(sample.meta$Condition, sample.meta$Replicate, sep="_")
rownames(sample.meta) <- sample.meta$Sample
# sim1.model <- model.matrix(~ 0 + Condition, data=sample.meta)
sim1.model <- model.matrix(~ Condition, data=sample.meta)
head(sim1.model)
(Intercept) ConditionB
A_R1 1 0
A_R2 1 0
A_R3 1 0
B_R1 1 1
B_R2 1 1
B_R3 1 1
I have a model matrix and counts matrix - let’s test edgeR on these.
sim1.dge <- DGEList(sim1.counts[, rownames(sim1.model)], lib.size=log(colSums(sim1.counts[, rownames(sim1.model)])))
sim1.dge <- estimateDisp(sim1.dge, sim1.model)
sim1.fit <- glmQLFit(sim1.dge, sim1.model, robust=TRUE)
# sim1.contrast <- makeContrasts(ConditionA - ConditionB, levels=sim1.model)
# sim1.res <- glmQLFTest(sim1.fit, contrast=sim1.contrast)
sim1.res <- as.data.frame(topTags(glmQLFTest(sim1.fit, coef=2), sort.by='none', n=Inf))
sim1.res$Sig <- as.factor(as.numeric(sim1.res$FDR <= 0.01))
sim1.res$Neighbourhood <- as.numeric(rownames(sim1.res))
# control the spatial FDR
qvals <- sim1.res$PValue
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 59 123
This indicates that 123 of the neighbour hoods are significant before spatial FDR correction. The implementation in Cydar relies on having the median marker intensities for each hypersphere. What is the equivalent positional information here? What about drawing a new graph for each neighborhood using the igraph::induced_subgraph function, then calculate the connectivity of this new sub-graph as a measure of the neighborhood density?
# this creates a sub-graph for each of the random vertices
test.subgraphs <- lapply(1:length(random.vertices),
FUN=function(X) induced_subgraph(sim1.knn, vertex.list[[X]]))
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
test.connect <- lapply(test.subgraphs, FUN=function(EG) vertex_connectivity(EG))
Vertex connectivity is slow to compute - edge connectivity might be faster and has the useful property that:
\[k(G) \leq \lambda(G) \] That is, the vertex connectivity \(k(G)\) is \(\leq\) than the edge connectivity \(\lambda(G)\). In this instance the edge-connectivity is an upper-bound on the neighborhood connectivity, and thus an upper bound on the density, leading to a slightly lower weighting for the spatial FDR adjustment.
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
edge.connect <- lapply(test.subgraphs, FUN=function(EG) edge_connectivity(EG))
The edge-connectivity takes ~10% of the time. How do they compare?
plot(unlist(test.connect), unlist(edge.connect), main="Graph-connectivity measures", xlab="Vertex-connectivity", ylab="Edge-connectivity")
For the most part the vertex-connectivity and edge-connectivity for neighborhoods are in pretty good agreement. I’ll maybe include the option to use either as a function parameter.
graph_spatialFDR <- function(neighborhoods, graph, pvalues, connectivity='vertex'){
# input a set of neighborhoods as a list of graph vertices
# the input graph and the unadjusted GLM p-values
#' neighborhoods: list of vertices and their respective neighborhoods
#' graph: input kNN graph
#' pvalues: a vector of pvalues in the same order as the neighborhood indices
#' connectivity: character - edge or vertex to calculate neighborhood connectivity
# Discarding NA pvalues.
haspval <- !is.na(pvalues)
if (!all(haspval)) {
coords <- coords[haspval, , drop=FALSE]
pvalues <- pvalues[haspval]
}
# define the subgraph for each neighborhood then calculate the vertex connectivity for each
# this latter computation is quite slow - can it be sped up?
subgraphs <- lapply(1:length(neighborhoods[haspval]),
FUN=function(X) induced_subgraph(graph, neighborhoods[haspval][[X]]))
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
if(connectivity == "vertex"){
connect <- lapply(subgraphs, FUN=function(EG) vertex_connectivity(EG))
} else if(connectivity == "edge"){
connect <- lapply(subgraphs, FUN=function(EG) edge_connectivity(EG))
} else{
errorCondition("connectivity option not recognised - must be either edge or vertex")
}
# use 1/connectivity as the weighting for the weighted BH adjustment from Cydar
w <- 1/unlist(connect)
# set Inf to 0
w[is.infinite(w)] <- 0
# Computing a density-weighted q-value.
o <- order(pvalues)
pvalues <- pvalues[o]
w <- w[o]
adjp <- numeric(length(o))
adjp[o] <- rev(cummin(rev(sum(w)*pvalues/cumsum(w))))
adjp <- pmin(adjp, 1)
if (!all(haspval)) {
refp <- rep(NA_real_, length(haspval))
refp[haspval] <- adjp
adjp <- refp
}
return(adjp)
}
start.time <- Sys.time()
sim1.spatialfdr <- graph_spatialFDR(neighborhoods=vertex.list, graph=sim1.knn, connectivity="edge",
pvalues=sim1.res[order(sim1.res$Neighbourhood), ]$PValue)
end.time <- Sys.time()
connect.time <- end.time - start.time
sim1.res$SpatialFDR[order(sim1.res$Neighbourhood)] <- sim1.spatialfdr
qvals <- sim1.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 62 120
How do these compare to the standard FDR adjustment?
plot(-log10(sim1.res[order(sim1.res$Neighbourhood), ]$FDR),
-log10(sim1.spatialfdr), main="FDR comparison", xlab="-log10 FDR", ylab="-log10 Spatial FDR")
In this instance there is not a huge difference. That seems to work fairly well. Does it make sense though? i.e. are the changes in the graph in the expected regions? I’ll follow the Cydar workflow for this and project the neighbourhoods into a reduced dimensional space.
neighborhood_expression <- function(neighborhoods, data.set){
# I'll calculate the average value of each neighborhood for each of the n features in the data.set
neighbour.model <- matrix(0L, ncol=length(neighborhoods), nrow=ncol(data.set))
# neighbour.model <- sapply(1:length(neighborhoods), FUN=function(X) print(length(neighbour.model[, X])))
for(x in seq_along(1:length(neighborhoods))){
neighbour.model[neighborhoods[[x]], x] <- 1
}
neigh.exprs <- t(neighbour.model) %*% t(data.set)
neigh.exprs <- t(apply(neigh.exprs, 2, FUN=function(XP) XP/colSums(neighbour.model)))
return(neigh.exprs)
}
sim1.neighbour.exprs <- neighborhood_expression(vertex.list, sim1.gex)
Embed these hyperspheres with a PCA and UMAP.
sim1.neighbour.pca <- prcomp((t(sim1.neighbour.exprs)))
pairs(sim1.neighbour.pca$x[, c(1:5)])
set.seed(42)
neighbourhood.umap <- umap(sim1.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
plot(neighbourhood.umap$layout,
xlab="UMAP 1", ylab="UMAP 2")
We can overlay the DA testing on these neighbourhoods.
neighbor.df <- sim1.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
neighbor.df <- do.call(cbind.data.frame, list(neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
neighbor.df$Sig <- as.numeric(neighbor.df$SpatialFDR <= 0.05)
ggplot(neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbor.df[neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=neighbor.df[neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
In each neighbourhood, what is the most common condition or block of cells?
neighbour.list <- list()
for(x in seq_along(1:length(vertex.list))){
x.df <- meta.df[meta.df$Vertex %in% vertex.list[[x]], ]
x.rep <- names(table(x.df$Replicate))[which(table(x.df$Replicate) == max(table(x.df$Replicate)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Block))[which(table(x.df$Block) == max(table(x.df$Block)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Condition))[which(table(x.df$Condition) == max(table(x.df$Condition)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Block"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
neighbour.meta <- do.call(rbind.data.frame, neighbour.list)
neighbour.merge <- merge(neighbor.df, neighbour.meta, by='Neighbourhood')
neighbour.merge$Block <- ordered(neighbour.merge$Block,
levels=c("B1", "B2", "B3"))
neighbour.merge$Diff <- sign(neighbour.merge$logFC)
neighbour.merge$Diff[neighbour.merge$Sig == 0] <- 0
ggplot(neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=neighbour.merge[neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Block)
This doesn’t make sense - Block 3 shouldn’t have any DA neighbourhoods. Is this a compositional effect we’re seeing here? It’s strange that a random fluctuation would cause this - it must be incredibly sensitive.
table(neighbour.merge$Block, neighbour.merge$Diff)
-1 0 1
B1 66 0 0
B2 0 0 56
B3 3 57 0
Is this due to some weird global scaling differences or is this a compositional effect? In addition to the high false-positive rate, there are also sign differences where they should be concordant within a block of data points. For each neighbourhood, how much does it overlap with the over blocks? Intuitively I would have expected 0 as they are well separated, however, this might be a cause for all these false DA neighbourhoods.
NB: This was due to different vertices being sampled, so the results weren’t concordant - set the same seed Mike!!!!
The false-positive rate is a little high here, but at least the signs are the correct way around.
plot(x=sim1.res$logFC, y=-log10(sim1.res$SpatialFDR), xlab="log FC", ylab="Spatial FDR",
col=ifelse(sim1.res$Sig == 1, "red", "black"))
For each neighbourhood, I need to count the number of points in each block, as well as condition and replicate.
all.samps <- unique(paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_"))
meta.df$All.Sample <- paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_")
all.count.matrix <- matrix(0L, ncol=length(all.samps), nrow=length(vertex.list))
colnames(all.count.matrix) <- all.samps
for(x in seq_along(1:length(vertex.list))){
v.x <- vertex.list[[x]]
for(i in seq_along(1:length(all.samps))){
i.s <- all.samps[i]
i.s.vertices <- intersect(v.x, meta.df[meta.df$All.Sample == i.s, ]$Vertex)
all.count.matrix[x, i] <- length(i.s.vertices)
}
}
all.count.melt <- melt(all.count.matrix)
all.count.melt$Var2 <- as.character(all.count.melt$Var2)
all.count.melt$Block <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[1])))
all.count.melt$Condition <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[2])))
all.count.melt$Replicate <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[3])))
ggplot(all.count.melt, aes(x=Block, y=value, fill=Condition)) +
geom_boxplot() +
theme_clean() +
facet_wrap(~Var1, scales="free_y")
Each panel is a neighbourhood, the numbers the counts of data points in each that com from either condition, and in the block of points. These are now concordant with the DA results, except neighbourhoods 7 and 26 - which look like a combination of sampling variance, which then creates a compositional effect. Either a filter on low-count neighbourhoods or on the log fold changes would ameliorate this.
NB: Emma suggested using the within neighbourhood distances instead of connectivity as a measure of density. This would require making our own KNN-graph building function that retains the distances between all pairs of vertices. In truth, this could just be added as the edge weights to the KNN-graph, which are ultimately ignored for the purpose of graph building, i.e. all edges = 1.
We have the PC coordinates that were used in the original KNN graph building, so strictly we just need this with the neighbourhood list to calculate the Euclidean distances. For the neighbourhood, do we take the average off-diagonal distance as the density?
neighbour.dist.list <- list()
for(x in seq_along(1:length(vertex.list))){
x.verts <- vertex.list[[x]]
x.pcs <- sim1.pca$x[x.verts, c(1:30)]
x.euclid <- as.matrix(dist(x.pcs))
x.distdens <- 1/mean(x.euclid[lower.tri(x.euclid, diag=FALSE)])
neighbour.dist.list[[x]] <- x.distdens
}
hist(unlist(neighbour.dist.list), 100, xlab="1/mean Euclidean distance")
This looks a little like the 3 blocks of points… Let’s try this for weighting the spatial FDR.
graph_spatialFDR <- function(neighborhoods, graph, pvalues, connectivity='vertex', pca=NULL){
# input a set of neighborhoods as a list of graph vertices
# the input graph and the unadjusted GLM p-values
#' neighborhoods: list of vertices and their respective neighborhoods
#' graph: input kNN graph
#' pvalues: a vector of pvalues in the same order as the neighborhood indices
#' connectivity: character - edge or vertex to calculate neighborhood connectivity or distance to use average Euclidean distance
#' pca: matrix of PCs to calculate Euclidean distances, only required when connectivity == distance
# Discarding NA pvalues.
haspval <- !is.na(pvalues)
if (!all(haspval)) {
coords <- coords[haspval, , drop=FALSE]
pvalues <- pvalues[haspval]
}
# define the subgraph for each neighborhood then calculate the vertex connectivity for each
# this latter computation is quite slow - can it be sped up?
subgraphs <- lapply(1:length(neighborhoods[haspval]),
FUN=function(X) induced_subgraph(graph, neighborhoods[haspval][[X]]))
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
if(connectivity == "vertex"){
t.connect <- lapply(subgraphs, FUN=function(EG) vertex_connectivity(EG))
} else if(connectivity == "edge"){
t.connect <- lapply(subgraphs, FUN=function(EG) edge_connectivity(EG))
} else if(connectivity == "distance"){
if(!is.null(pca)){
t.connect <- lapply(1:length(neighborhoods[haspval]),
FUN=function(PG) {
x.pcs <- pca[neighborhoods[haspval][[PG]], ]
x.euclid <- as.matrix(dist(x.pcs))
x.distdens <- 1/mean(x.euclid[lower.tri(x.euclid, diag=FALSE)])
return(x.distdens)})
} else{
errorCondition("A matrix of PCs is required to calculate distances")
}
}else{
errorCondition("connectivity option not recognised - must be either edge, vertex or distance")
}
# use 1/connectivity as the weighting for the weighted BH adjustment from Cydar
w <- 1/unlist(t.connect)
w[is.infinite(w)] <- 0
# Computing a density-weighted q-value.
o <- order(pvalues)
pvalues <- pvalues[o]
w <- w[o]
adjp <- numeric(length(o))
adjp[o] <- rev(cummin(rev(sum(w)*pvalues/cumsum(w))))
adjp <- pmin(adjp, 1)
if (!all(haspval)) {
refp <- rep(NA_real_, length(haspval))
refp[haspval] <- adjp
adjp <- refp
}
return(adjp)
}
start.time <- Sys.time()
sim1.spatialfdr <- graph_spatialFDR(neighborhoods=vertex.list, graph=sim1.knn, connectivity="distance",
pca=sim1.pca$x[, c(1:30)],
pvalues=sim1.res[order(sim1.res$Neighbourhood), ]$PValue)
end.time <- Sys.time()
distance.time <- end.time - start.time
sim1.res$SpatialFDR.Dist[order(sim1.res$Neighbourhood)] <- sim1.spatialfdr
qvals <- sim1.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 62 120
How does this compare with using connectivity for the spatial FDR correction?
sink(file="/dev/null")
pdf("~/Dropbox/Milo/simulations/Connectivity_vs_distance_SpatialFDR.pdf",
heigh=3.15, width=4.25, useDingbats=FALSE)
plot(-log10(sim1.res$SpatialFDR.Dist), -log10(sim1.res$SpatialFDR), xlab="-log10 distance Spatial FDR",
ylab="-log10 connectivity Spatial FDR")
dev.off()
sink(file=NULL)
plot(-log10(sim1.res$SpatialFDR.Dist), -log10(sim1.res$SpatialFDR), xlab="-log10 distance Spatial FDR",
ylab="-log10 connectivity Spatial FDR")
Aside from the small number of false-positive samples, this works fairly well on these simulated data. Can we also get similar results with a real-world data? For this ’ll test the 1 and 52 week old TEC from our Ageing thymus paper, as these have the biggest differences between them.
# read in normalised data, subset to 1 and 52 week old TEC, do a PCA and construct a kNN graph.
tec.meta <- read.table("~/Dropbox/AgeingExperiment/Frozen/ThymusMarker_tSNE_PCA_meta.tsv",
sep="\t", header=TRUE, stringsAsFactors=FALSE)
# exclude technical artifact cluster
tec.meta <- tec.meta[!tec.meta$TFIDF.Cluster %in% c(4), ]
tec.sub.meta <- tec.meta[tec.meta$Age %in% c("1wk", "52wk"), ]
# add the label annotation
tec.sub.meta$Cluster <- "Unknown"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "2"] <- "Intertypical TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "9"] <- "Perinatal cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "3"] <- "Mature cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "7"] <- "Mature mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "1"] <- "Post-Aire mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "5"] <- "Tuft-like mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "6"] <- "Proliferating TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "8"] <- "nTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "10"] <- "sTEC"
inter.cols <- c("#9970ab", "#35978f", "#B0cdc1", "#762a83", "#01665e", "#e7d4e8", "#dfc27d", "#8c510a" ,"#bf812d")
names(inter.cols) <- c("Post-Aire mTEC", 'Intertypical TEC', 'Mature cTEC', 'Tuft-like mTEC',
'Proliferating TEC', 'Mature mTEC', 'nTEC', 'Perinatal cTEC', 'sTEC')
tec.gex <- read.table("~/Dropbox/AgeingExperiment/Frozen/ThymusQC_SFnorm.tsv.gz",
sep="\t", header=TRUE, stringsAsFactors=FALSE)
tec.sub.gex <- tec.gex[, colnames(tec.gex) %in% tec.sub.meta$Sample]
tec.hvgs <- read.table("~/Dropbox/AgeingExperiment/Frozen/Thymus_HVG.tsv",
sep="\t", header=TRUE, stringsAsFactors=FALSE)
I’ll build a kNN-graph from the first 30 PCs previously computed on all TEC.
set.seed(42)
tec.knn <- buildKNNGraph(x=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]), k=21, d=NA, transposed=TRUE)
tec.fr.layout <- layout_with_fr(tec.knn)
plot(tec.knn, layout=tec.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
This is a fairly densely connected network, how does the UMAP look?
set.seed(42)
tec.umap <- umap(as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.2)
tec.umap.df <- as.data.frame(tec.umap$layout)
colnames(tec.umap.df) <- c("UMAP1", "UMAP2")
tec.umap.df$Sample <- tec.sub.meta$Sample
tec.umap.merge <- merge(tec.umap.df, tec.sub.meta, by='Sample')
ggplot(tec.umap.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Cluster)) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~Age) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
# randomly select vertices in the graph
n.hood <- 0.05
tec.random.vertices <- sample(V(tec.knn), size=floor(n.hood*length(V(tec.knn))))
# loop over random vertices and count the number of cells in each
tec.vertex.list <- sapply(1:length(tec.random.vertices), FUN=function(X) neighbors(tec.knn, v=tec.random.vertices[X]))
hist(unlist(lapply(tec.vertex.list, length)), 100, main="Histogram of neighbors", xlab="Neighbourhood size")
This is the histogram of TEC neighbourhood sizes.
tec.umap.merge$ExpSamp <- paste(tec.umap.merge$Age, tec.umap.merge$SortDay, sep="_")
tec.umap.merge$Vertex <- c(1:nrow(tec.umap.merge))
tec.counts <- quant_neighbourhood(graph=tec.knn, meta=tec.umap.merge, sample.column='ExpSamp', sample.vertices=n.hood)
tec.reps <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[2])))
tec.cond <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[1])))
tec.sample.meta <- data.frame("Condition"=tec.cond,
"Replicate"=tec.reps)
tec.sample.meta$Sample <- paste(tec.sample.meta$Condition, tec.sample.meta$Replicate, sep="_")
rownames(tec.sample.meta) <- tec.sample.meta$Sample
tec.model <- model.matrix(~ Condition, data=tec.sample.meta)
head(tec.model)
(Intercept) Condition52wk
1wk_4 1 0
52wk_5 1 1
1wk_5 1 0
1wk_2 1 0
1wk_1 1 0
1wk_3 1 0
tec.dge <- DGEList(tec.counts[, rownames(tec.model)], lib.size=log(colSums(tec.counts[, rownames(tec.model)])))
tec.dge <- estimateDisp(tec.dge, tec.model)
tec.fit <- glmQLFit(tec.dge, tec.model, robust=TRUE)
# tec.contrast <- makeContrasts(ConditionA - ConditionB, levels=tec.model)
# tec.res <- glmQLFTest(tec.fit, contrast=tec.contrast)
tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=2), sort.by='none', n=Inf))
tec.res$Sig <- as.factor(as.numeric(tec.res$FDR <= 0.01))
tec.res$Neighbourhood <- as.numeric(rownames(tec.res))
# control the spatial FDR
qvals <- tec.res$PValue
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 27 18
There are 18 DA neighbourhoods - I expect these should reflect the Perinatal, Intertypical, Proliferating and sTEC. I’ll use the distance-based approach to correct for the spatial FDR.
tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=tec.res[order(tec.res$Neighbourhood), ]$PValue)
tec.res$SpatialFDR[order(tec.res$Neighbourhood)] <- tec.spatialfdr
qvals <- tec.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 30 15
Interesting, 3 neighbourhoods are no longer statistically significant after the spatial FDR correction - hopefully these are genuinely false-positives.
In each neighbourhood, what is the most common condition or block of cells?
tec.neighbour.exprs <- neighborhood_expression(tec.vertex.list, tec.sub.gex)
Embed these hyperspheres with a PCA and UMAP.
tec.neighbour.pca <- prcomp((t(tec.neighbour.exprs[tec.hvgs$HVG, ])))
pairs(tec.neighbour.pca$x[, c(1:5)])
set.seed(42)
neighbourhood.umap <- umap(tec.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
We can overlay the DA testing on these neighbourhoods.
tec.neighbor.df <- tec.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
tec.neighbor.df <- do.call(cbind.data.frame, list(tec.neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(tec.neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
tec.neighbor.df$Sig <- as.numeric(tec.neighbor.df$SpatialFDR <= 0.05)
ggplot(tec.neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
Some are up and some are down. Which TEC subtypes do they largely correspond with?
tec.neighbour.list <- list()
for(x in seq_along(1:length(tec.vertex.list))){
x.df <- tec.umap.merge[tec.umap.merge$Vertex %in% tec.vertex.list[[x]], ]
x.rep <- names(table(x.df$SortDay))[which(table(x.df$SortDay) == max(table(x.df$SortDay)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Cluster))[which(table(x.df$Cluster) == max(table(x.df$Cluster)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Age))[which(table(x.df$Age) == max(table(x.df$Age)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
tec.neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Cluster"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
tec.neighbour.meta <- do.call(rbind.data.frame, tec.neighbour.list)
tec.neighbour.merge <- merge(tec.neighbor.df, tec.neighbour.meta, by='Neighbourhood')
tec.neighbour.merge$Diff <- sign(tec.neighbour.merge$logFC)
tec.neighbour.merge$Diff[tec.neighbour.merge$Sig == 0] <- 0
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge[tec.neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Cluster)
There is a subset of the Proliferating TEC that are down, good, as are the Perinatal cTEC. Likewise, most of the Intertypical TEC are up as well, but some are also down. I don’t know if this is because of a compositional effect or because these are the ones that would be differentiating into mTEC via the Proliferating TEC compartment.
table(tec.neighbour.merge$Cluster, tec.neighbour.merge$Diff)
-1 0 1
Mature mTEC 0 16 0
Intertypical TEC 2 6 10
Perinatal cTEC 4 0 0
Mature cTEC 0 1 3
Post-Aire mTEC 0 2 0
Proliferating TEC 1 0 0
I would say that these results make a lot of sense. Can I now extend this to include all time points and fit age as a linear ordinal variable?
# exclude technical artifact cluster
tec.meta <- tec.meta[!tec.meta$TFIDF.Cluster %in% c(4), ]
tec.sub.meta <- tec.meta
tec.sub.meta$AgeFactor <- ordered(tec.sub.meta$Age,
levels=c("1wk", "4wk", "16wk", "32wk", "52wk"))
# add the label annotation
tec.sub.meta$Cluster <- "Unknown"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "2"] <- "Intertypical TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "9"] <- "Perinatal cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "3"] <- "Mature cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "7"] <- "Mature mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "1"] <- "Post-Aire mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "5"] <- "Tuft-like mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "6"] <- "Proliferating TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "8"] <- "nTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "10"] <- "sTEC"
inter.cols <- c("#9970ab", "#35978f", "#B0cdc1", "#762a83", "#01665e", "#e7d4e8", "#dfc27d", "#8c510a" ,"#bf812d")
names(inter.cols) <- c("Post-Aire mTEC", 'Intertypical TEC', 'Mature cTEC', 'Tuft-like mTEC',
'Proliferating TEC', 'Mature mTEC', 'nTEC', 'Perinatal cTEC', 'sTEC')
tec.sub.gex <- tec.gex[, colnames(tec.gex) %in% tec.sub.meta$Sample]
I’ll build a kNN-graph from the first 30 PCs previously computed on all TEC.
set.seed(42)
tec.knn <- buildKNNGraph(x=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]), k=21, d=NA, transposed=TRUE)
tec.fr.layout <- layout_with_fr(tec.knn)
plot(tec.knn, layout=tec.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
This is a fairly densely connected network, how does the UMAP look?
set.seed(42)
tec.umap <- umap(as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.2)
tec.umap.df <- as.data.frame(tec.umap$layout)
colnames(tec.umap.df) <- c("UMAP1", "UMAP2")
tec.umap.df$Sample <- tec.sub.meta$Sample
tec.umap.merge <- merge(tec.umap.df, tec.sub.meta, by='Sample')
ggplot(tec.umap.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Cluster)) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~AgeFactor) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
# randomly select vertices in the graph
n.hood <- 0.05
tec.random.vertices <- sample(V(tec.knn), size=floor(n.hood*length(V(tec.knn))))
# loop over random vertices and count the number of cells in each
tec.vertex.list <- sapply(1:length(tec.random.vertices), FUN=function(X) neighbors(tec.knn, v=tec.random.vertices[X]))
hist(unlist(lapply(tec.vertex.list, length)), 100, main="Histogram of neighbors", xlab="Neighbourhood size")
This is the histogram of TEC neighbourhood sizes.
tec.umap.merge$ExpSamp <- paste(tec.umap.merge$Age, tec.umap.merge$SortDay, sep="_")
tec.umap.merge$Vertex <- c(1:nrow(tec.umap.merge))
tec.counts <- quant_neighbourhood(graph=tec.knn, meta=tec.umap.merge, sample.column='ExpSamp', sample.vertices=n.hood)
tec.reps <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[2])))
tec.cond <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[1])))
tec.sample.meta <- data.frame("Condition"=tec.cond,
"Replicate"=tec.reps)
tec.sample.meta$Sample <- paste(tec.sample.meta$Condition, tec.sample.meta$Replicate, sep="_")
rownames(tec.sample.meta) <- tec.sample.meta$Sample
tec.sample.meta$Condition <- ordered(tec.sample.meta$Condition,
levels=c("1wk", "4wk", "16wk", "32wk", "52wk"))
tec.model <- model.matrix(~ Condition, data=tec.sample.meta)
head(tec.model)
(Intercept) Condition.L Condition.Q Condition.C Condition^4
1wk_4 1 -0.6324555 0.5345225 -0.3162278 0.1195229
4wk_3 1 -0.3162278 -0.2672612 0.6324555 -0.4780914
32wk_2 1 0.3162278 -0.2672612 -0.6324555 -0.4780914
32wk_5 1 0.3162278 -0.2672612 -0.6324555 -0.4780914
52wk_5 1 0.6324555 0.5345225 0.3162278 0.1195229
4wk_2 1 -0.3162278 -0.2672612 0.6324555 -0.4780914
tec.dge <- DGEList(tec.counts[, rownames(tec.model)], lib.size=log(colSums(tec.counts[, rownames(tec.model)])))
tec.dge <- estimateDisp(tec.dge, tec.model)
tec.fit <- glmQLFit(tec.dge, tec.model, robust=TRUE)
# tec.contrast <- makeContrasts(ConditionA - ConditionB, levels=tec.model)
# tec.res <- glmQLFTest(tec.fit, contrast=tec.contrast)
tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=2), sort.by='none', n=Inf))
tec.res$Sig <- as.factor(as.numeric(tec.res$FDR <= 0.01))
tec.res$Neighbourhood <- as.numeric(rownames(tec.res))
# control the spatial FDR
qvals <- tec.res$PValue
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 70 46
There are 46 DA neighbourhoods - I expect these should reflect the Perinatal, Intertypical, Proliferating and sTEC. I’ll use the distance-based approach to correct for the spatial FDR.
tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=tec.res[order(tec.res$Neighbourhood), ]$PValue)
tec.res$SpatialFDR[order(tec.res$Neighbourhood)] <- tec.spatialfdr
qvals <- tec.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 82 34
Interesting, 12 neighbourhoods are no longer statistically significant after the spatial FDR correction - hopefully these are genuinely false-positives.
In each neighbourhood, what is the most common condition or block of cells?
tec.neighbour.exprs <- neighborhood_expression(tec.vertex.list, tec.sub.gex)
Embed these hyperspheres with a PCA and UMAP.
tec.neighbour.pca <- prcomp((t(tec.neighbour.exprs[tec.hvgs$HVG, ])))
pairs(tec.neighbour.pca$x[, c(1:5)])
set.seed(42)
neighbourhood.umap <- umap(tec.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
We can overlay the DA testing on these neighbourhoods.
tec.neighbor.df <- tec.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
tec.neighbor.df <- do.call(cbind.data.frame, list(tec.neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(tec.neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
tec.neighbor.df$Sig <- as.numeric(tec.neighbor.df$SpatialFDR <= 0.05)
ggplot(tec.neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
Some are up and some are down. Which TEC subtypes do they largely correspond with? That big streak of lower abundance neighbourhoods should be the differentiation trajectory from Intertypical to Mature mTEC.
tec.neighbour.list <- list()
for(x in seq_along(1:length(tec.vertex.list))){
x.df <- tec.umap.merge[tec.umap.merge$Vertex %in% tec.vertex.list[[x]], ]
x.rep <- names(table(x.df$SortDay))[which(table(x.df$SortDay) == max(table(x.df$SortDay)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Cluster))[which(table(x.df$Cluster) == max(table(x.df$Cluster)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Age))[which(table(x.df$Age) == max(table(x.df$Age)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
tec.neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Cluster"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
tec.neighbour.meta <- do.call(rbind.data.frame, tec.neighbour.list)
tec.neighbour.merge <- merge(tec.neighbor.df, tec.neighbour.meta, by='Neighbourhood')
tec.neighbour.merge$Diff <- sign(tec.neighbour.merge$logFC)
tec.neighbour.merge$Diff[tec.neighbour.merge$Sig == 0] <- 0
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge[tec.neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Cluster)
This very nicely recapitulates the DA testing using clusters, and it pinpoints the loss of medulla-biased Intertypical TEC which we only really observed in our larger experiments. This is working beyond my wildest dreams!!
table(tec.neighbour.merge$Cluster, tec.neighbour.merge$Diff)
-1 0 1
Mature mTEC 4 32 2
Proliferating TEC 7 4 0
Intertypical TEC 8 12 18
Perinatal cTEC 5 0 0
sTEC 0 0 1
Post-Aire mTEC 0 7 0
Mature cTEC 0 6 5
nTEC 0 2 0
Tuft-like mTEC 0 3 0
I would say that these results make a lot of sense. I’ll extend it to include the quadratic testing which should pick up the inverse-parabolic profile of the Post-Aire mTEC population.
quad.tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=3), sort.by='none', n=Inf))
quad.tec.res$Sig <- as.factor(as.numeric(quad.tec.res$FDR <= 0.01))
quad.tec.res$Neighbourhood <- as.numeric(rownames(quad.tec.res))
# control the spatial FDR
qvals <- quad.tec.res$PValue
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 110 6
There are 6 DA neighbourhoods from the quadratic model - I expect these should reflect the Post-Aire mTEC.
quad.tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=quad.tec.res[order(quad.tec.res$Neighbourhood), ]$PValue)
quad.tec.res$SpatialFDR[order(quad.tec.res$Neighbourhood)] <- quad.tec.spatialfdr
qvals <- quad.tec.spatialfdr
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE
logical 116
Dang, clearly the small group of Post-Aire mTEC means this isn’t sufficiently sensitive after multiple-testing correction.
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge,
aes(colour=Cluster), size=4) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~Cluster)
Hmm, the Post-Aire mTEC don’t form a single neighbourhood on their own, nor do the nTEC or Tuft-like mTEC. There is definitely a limit to the resolution. Would a smaller \(k\) resolve this?
tec.sub.meta <- tec.meta[tec.meta$Age %in% c("1wk", "52wk"), ]
# add the label annotation
tec.sub.meta$Cluster <- "Unknown"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "2"] <- "Intertypical TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "9"] <- "Perinatal cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "3"] <- "Mature cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "7"] <- "Mature mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "1"] <- "Post-Aire mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "5"] <- "Tuft-like mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "6"] <- "Proliferating TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "8"] <- "nTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "10"] <- "sTEC"
inter.cols <- c("#9970ab", "#35978f", "#B0cdc1", "#762a83", "#01665e", "#e7d4e8", "#dfc27d", "#8c510a" ,"#bf812d")
names(inter.cols) <- c("Post-Aire mTEC", 'Intertypical TEC', 'Mature cTEC', 'Tuft-like mTEC',
'Proliferating TEC', 'Mature mTEC', 'nTEC', 'Perinatal cTEC', 'sTEC')
I’ll build a kNN-graph from the first 30 PCs previously computed on all TEC, but k=11 this time.
set.seed(42)
tec.knn <- buildKNNGraph(x=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]), k=11, d=NA, transposed=TRUE)
tec.fr.layout <- layout_with_fr(tec.knn)
plot(tec.knn, layout=tec.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
This is a fairly densely connected network still, even with k=11, how does the UMAP look?
set.seed(42)
tec.umap <- umap(as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
n_components=2,
n_neighbors=11, metric='euclidean',
init='random', min_dist=0.2)
tec.umap.df <- as.data.frame(tec.umap$layout)
colnames(tec.umap.df) <- c("UMAP1", "UMAP2")
tec.umap.df$Sample <- tec.sub.meta$Sample
tec.umap.merge <- merge(tec.umap.df, tec.sub.meta, by='Sample')
ggplot(tec.umap.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Cluster)) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~Age) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
For a smaller k, does there need to be a higher density of sampling, i.e. more neighbourhoods? I’ll set it to 10% here instead.
# randomly select vertices in the graph
n.hood <- 0.10
tec.random.vertices <- sample(V(tec.knn), size=floor(n.hood*length(V(tec.knn))))
# loop over random vertices and count the number of cells in each
tec.vertex.list <- sapply(1:length(tec.random.vertices), FUN=function(X) neighbors(tec.knn, v=tec.random.vertices[X]))
hist(unlist(lapply(tec.vertex.list, length)), 100, main="Histogram of neighbors", xlab="Neighbourhood size")
This is the histogram of TEC neighbourhood sizes.
tec.umap.merge$ExpSamp <- paste(tec.umap.merge$Age, tec.umap.merge$SortDay, sep="_")
tec.umap.merge$Vertex <- c(1:nrow(tec.umap.merge))
tec.counts <- quant_neighbourhood(graph=tec.knn, meta=tec.umap.merge, sample.column='ExpSamp', sample.vertices=n.hood)
tec.reps <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[2])))
tec.cond <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[1])))
tec.sample.meta <- data.frame("Condition"=tec.cond,
"Replicate"=tec.reps)
tec.sample.meta$Sample <- paste(tec.sample.meta$Condition, tec.sample.meta$Replicate, sep="_")
rownames(tec.sample.meta) <- tec.sample.meta$Sample
tec.model <- model.matrix(~ Condition, data=tec.sample.meta)
head(tec.model)
(Intercept) Condition52wk
1wk_4 1 0
52wk_5 1 1
1wk_5 1 0
1wk_2 1 0
1wk_1 1 0
1wk_3 1 0
tec.dge <- DGEList(tec.counts[, rownames(tec.model)], lib.size=log(colSums(tec.counts[, rownames(tec.model)])))
tec.dge <- estimateDisp(tec.dge, tec.model)
tec.fit <- glmQLFit(tec.dge, tec.model, robust=TRUE)
# tec.contrast <- makeContrasts(ConditionA - ConditionB, levels=tec.model)
# tec.res <- glmQLFTest(tec.fit, contrast=tec.contrast)
tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=2), sort.by='none', n=Inf))
tec.res$Sig <- as.factor(as.numeric(tec.res$FDR <= 0.05))
tec.res$Neighbourhood <- as.numeric(rownames(tec.res))
# control the spatial FDR
qvals <- tec.res$PValue
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE TRUE
logical 43 48
I have increased the total number of neighbourhoods defined with k=11, there will almost certainly be a trade-off between sensitivity and power w.r.t. the extra multiple-testing burden, as well as the higher sampling variance as each neighbourhood will contain fewer cells <- this will be something we need to optimise in some way.
tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=tec.res[order(tec.res$Neighbourhood), ]$PValue)
tec.res$SpatialFDR[order(tec.res$Neighbourhood)] <- tec.spatialfdr
qvals <- tec.spatialfdr
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE TRUE
logical 54 37
Interesting, 11 of the neighbourhoods are no long statistically significant after the spatial FDR correction. Clearly there is a trade-off between neighbourhood size, sensitivity and power.
In each neighbourhood, what is the most common condition or block of cells?
tec.neighbour.exprs <- neighborhood_expression(tec.vertex.list, tec.sub.gex)
Embed these hyperspheres with a PCA and UMAP.
tec.neighbour.pca <- prcomp((t(tec.neighbour.exprs[tec.hvgs$HVG, ])))
pairs(tec.neighbour.pca$x[, c(1:5)])
set.seed(42)
neighbourhood.umap <- umap(tec.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=11, metric='euclidean',
init='random', min_dist=0.1)
We can overlay the DA testing on these neighbourhoods.
tec.neighbor.df <- tec.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
tec.neighbor.df <- do.call(cbind.data.frame, list(tec.neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(tec.neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
tec.neighbor.df$Sig <- as.numeric(tec.neighbor.df$SpatialFDR <= 0.05)
ggplot(tec.neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
Some are up and some are down. Which TEC subtypes do they largely correspond with?
tec.neighbour.list <- list()
for(x in seq_along(1:length(tec.vertex.list))){
x.df <- tec.umap.merge[tec.umap.merge$Vertex %in% tec.vertex.list[[x]], ]
x.rep <- names(table(x.df$SortDay))[which(table(x.df$SortDay) == max(table(x.df$SortDay)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Cluster))[which(table(x.df$Cluster) == max(table(x.df$Cluster)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Age))[which(table(x.df$Age) == max(table(x.df$Age)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
tec.neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Cluster"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
tec.neighbour.meta <- do.call(rbind.data.frame, tec.neighbour.list)
tec.neighbour.merge <- merge(tec.neighbor.df, tec.neighbour.meta, by='Neighbourhood')
tec.neighbour.merge$Diff <- sign(tec.neighbour.merge$logFC)
tec.neighbour.merge$Diff[tec.neighbour.merge$Sig == 0] <- 0
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge[tec.neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Cluster)
There is a subset of the Proliferating TEC that are down, good, as are the Perinatal cTEC. Likewise, most of the Intertypical TEC are up as well, but some are also down. I don’t know if this is because of a compositional effect or because these are the ones that would be differentiating into mTEC via the Proliferating TEC compartment.
table(tec.neighbour.merge$Cluster, tec.neighbour.merge$Diff)
-1 0 1
Mature mTEC 0 30 0
Intertypical TEC 4 14 17
Perinatal cTEC 8 1 0
Mature cTEC 1 1 6
Post-Aire mTEC 0 4 0
Proliferating TEC 1 3 0
Tuft-like mTEC 0 1 0
I would say that these results make a lot of sense. Can I now extend this to include all time points and fit age as a linear ordinal variable?
# exclude technical artifact cluster
tec.sub.meta <- tec.meta
tec.sub.meta$AgeFactor <- ordered(tec.sub.meta$Age,
levels=c("1wk", "4wk", "16wk", "32wk", "52wk"))
# add the label annotation
tec.sub.meta$Cluster <- "Unknown"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "2"] <- "Intertypical TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "9"] <- "Perinatal cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "3"] <- "Mature cTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "7"] <- "Mature mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "1"] <- "Post-Aire mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "5"] <- "Tuft-like mTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "6"] <- "Proliferating TEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "8"] <- "nTEC"
tec.sub.meta$Cluster[tec.sub.meta$TFIDF.Cluster == "10"] <- "sTEC"
inter.cols <- c("#9970ab", "#35978f", "#B0cdc1", "#762a83", "#01665e", "#e7d4e8", "#dfc27d", "#8c510a" ,"#bf812d")
names(inter.cols) <- c("Post-Aire mTEC", 'Intertypical TEC', 'Mature cTEC', 'Tuft-like mTEC',
'Proliferating TEC', 'Mature mTEC', 'nTEC', 'Perinatal cTEC', 'sTEC')
tec.sub.gex <- tec.gex[, colnames(tec.gex) %in% tec.sub.meta$Sample]
I’ll build a kNN-graph from the first 30 PCs previously computed on all TEC.
set.seed(42)
tec.knn <- buildKNNGraph(x=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]), k=11, d=NA, transposed=TRUE)
tec.fr.layout <- layout_with_fr(tec.knn)
plot(tec.knn, layout=tec.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
This is a fairly densely connected network, how does the UMAP look?
set.seed(42)
tec.umap <- umap(as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
n_components=2,
n_neighbors=11, metric='euclidean',
init='random', min_dist=0.2)
tec.umap.df <- as.data.frame(tec.umap$layout)
colnames(tec.umap.df) <- c("UMAP1", "UMAP2")
tec.umap.df$Sample <- tec.sub.meta$Sample
tec.umap.merge <- merge(tec.umap.df, tec.sub.meta, by='Sample')
ggplot(tec.umap.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Cluster)) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~AgeFactor) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
# randomly select vertices in the graph
n.hood <- 0.10
tec.random.vertices <- sample(V(tec.knn), size=floor(n.hood*length(V(tec.knn))))
# loop over random vertices and count the number of cells in each
tec.vertex.list <- sapply(1:length(tec.random.vertices), FUN=function(X) neighbors(tec.knn, v=tec.random.vertices[X]))
hist(unlist(lapply(tec.vertex.list, length)), 100, main="Histogram of neighbors", xlab="Neighbourhood size")
This is the histogram of TEC neighbourhood sizes.
tec.umap.merge$ExpSamp <- paste(tec.umap.merge$Age, tec.umap.merge$SortDay, sep="_")
tec.umap.merge$Vertex <- c(1:nrow(tec.umap.merge))
tec.counts <- quant_neighbourhood(graph=tec.knn, meta=tec.umap.merge, sample.column='ExpSamp', sample.vertices=n.hood)
tec.reps <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[2])))
tec.cond <- unlist(lapply(strsplit(unique(tec.umap.merge$ExpSamp), split="_"), FUN=function(X) paste0(X[1])))
tec.sample.meta <- data.frame("Condition"=tec.cond,
"Replicate"=tec.reps)
tec.sample.meta$Sample <- paste(tec.sample.meta$Condition, tec.sample.meta$Replicate, sep="_")
rownames(tec.sample.meta) <- tec.sample.meta$Sample
tec.sample.meta$Condition <- ordered(tec.sample.meta$Condition,
levels=c("1wk", "4wk", "16wk", "32wk", "52wk"))
tec.model <- model.matrix(~ Condition, data=tec.sample.meta)
head(tec.model)
(Intercept) Condition.L Condition.Q Condition.C Condition^4
1wk_4 1 -0.6324555 0.5345225 -0.3162278 0.1195229
4wk_3 1 -0.3162278 -0.2672612 0.6324555 -0.4780914
32wk_2 1 0.3162278 -0.2672612 -0.6324555 -0.4780914
32wk_5 1 0.3162278 -0.2672612 -0.6324555 -0.4780914
52wk_5 1 0.6324555 0.5345225 0.3162278 0.1195229
4wk_2 1 -0.3162278 -0.2672612 0.6324555 -0.4780914
tec.dge <- DGEList(tec.counts[, rownames(tec.model)], lib.size=log(colSums(tec.counts[, rownames(tec.model)])))
tec.dge <- estimateDisp(tec.dge, tec.model)
tec.fit <- glmQLFit(tec.dge, tec.model, robust=TRUE)
# tec.contrast <- makeContrasts(ConditionA - ConditionB, levels=tec.model)
# tec.res <- glmQLFTest(tec.fit, contrast=tec.contrast)
tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=2), sort.by='none', n=Inf))
tec.res$Sig <- as.factor(as.numeric(tec.res$FDR <= 0.05))
tec.res$Neighbourhood <- as.numeric(rownames(tec.res))
# control the spatial FDR
qvals <- tec.res$PValue
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE TRUE
logical 110 122
There are 85 DA neighbourhoods - I expect these should reflect the Perinatal, Intertypical, Proliferating and sTEC. I’ll use the distance-based approach to correct for the spatial FDR.
tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=tec.res[order(tec.res$Neighbourhood), ]$PValue)
tec.res$SpatialFDR[order(tec.res$Neighbourhood)] <- tec.spatialfdr
qvals <- tec.spatialfdr
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE TRUE
logical 136 96
Interesting, there are 26 neighbourhoods no longer statistically significant after the spatial FDR correction - hopefully these are genuinely false-positives.
In each neighbourhood, what is the most common condition or block of cells?
tec.neighbour.exprs <- neighborhood_expression(tec.vertex.list, tec.sub.gex)
Embed these hyperspheres with a PCA and UMAP.
tec.neighbour.pca <- prcomp((t(tec.neighbour.exprs[tec.hvgs$HVG, ])))
pairs(tec.neighbour.pca$x[, c(1:5)])
set.seed(42)
neighbourhood.umap <- umap(tec.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=11, metric='euclidean',
init='random', min_dist=0.1)
We can overlay the DA testing on these neighbourhoods.
tec.neighbor.df <- tec.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
tec.neighbor.df <- do.call(cbind.data.frame, list(tec.neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(tec.neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
tec.neighbor.df$Sig <- as.numeric(tec.neighbor.df$SpatialFDR <= 0.05)
ggplot(tec.neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=tec.neighbor.df[tec.neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
Some are up and some are down. Which TEC subtypes do they largely correspond with? That big streak of lower abundance neighbourhoods should be the differentiation trajectory from Intertypical to Mature mTEC.
tec.neighbour.list <- list()
for(x in seq_along(1:length(tec.vertex.list))){
x.df <- tec.umap.merge[tec.umap.merge$Vertex %in% tec.vertex.list[[x]], ]
x.rep <- names(table(x.df$SortDay))[which(table(x.df$SortDay) == max(table(x.df$SortDay)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Cluster))[which(table(x.df$Cluster) == max(table(x.df$Cluster)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Age))[which(table(x.df$Age) == max(table(x.df$Age)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
tec.neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Cluster"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
tec.neighbour.meta <- do.call(rbind.data.frame, tec.neighbour.list)
tec.neighbour.merge <- merge(tec.neighbor.df, tec.neighbour.meta, by='Neighbourhood')
tec.neighbour.merge$Diff <- sign(tec.neighbour.merge$logFC)
tec.neighbour.merge$Diff[tec.neighbour.merge$Sig == 0] <- 0
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge[tec.neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Cluster)
This extended analysis with k=11 also detects the increase in the small sTEC population. There also appears to be more heterogeneity in the Intertypical TEC, and, somewhat ingtriguingly, changes amongst the mature mTEC which were not detected originally.
table(tec.neighbour.merge$Cluster, tec.neighbour.merge$Diff)
-1 0 1
Mature mTEC 5 65 2
Proliferating TEC 11 6 0
Intertypical TEC 16 28 43
Perinatal cTEC 6 3 0
sTEC 0 0 1
Post-Aire mTEC 0 14 0
Mature cTEC 2 14 10
nTEC 0 3 0
Tuft-like mTEC 0 3 0
I would say that these results make a lot of sense. I’ll extend it to include the quadratic testing which should pick up the inverse-parabolic profile of the Post-Aire mTEC population.
quad.tec.res <- as.data.frame(topTags(glmQLFTest(tec.fit, coef=3), sort.by='none', n=Inf))
quad.tec.res$Sig <- as.factor(as.numeric(quad.tec.res$FDR <= 0.05))
quad.tec.res$Neighbourhood <- as.numeric(rownames(quad.tec.res))
# control the spatial FDR
qvals <- quad.tec.res$PValue
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE TRUE
logical 205 27
There are 27 DA neighbourhoods from the quadratic model - I expect these should reflect the Post-Aire mTEC.
quad.tec.spatialfdr <- graph_spatialFDR(neighborhoods=tec.vertex.list, graph=tec.knn, connectivity="distance",
pca=as.matrix(tec.sub.meta[, paste0("PC", 1:30)]),
pvalues=quad.tec.res[order(quad.tec.res$Neighbourhood), ]$PValue)
quad.tec.res$SpatialFDR[order(quad.tec.res$Neighbourhood)] <- quad.tec.spatialfdr
qvals <- quad.tec.spatialfdr
is.sig <- qvals <= 0.05
summary(is.sig)
Mode FALSE
logical 232
Dang, clearly still the small group of Post-Aire mTEC means this isn’t sufficiently sensitive after multiple-testing correction.
ggplot(tec.neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=tec.neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=tec.neighbour.merge,
aes(colour=Cluster), size=3) +
theme_clean() +
scale_colour_manual(values=inter.cols) +
facet_wrap(~Cluster)
Hmm, the Post-Aire mTEC don’t form a single neighbourhood on their own but rather 3 separate ones. I would say this possibly too granular.
Firstly, are compositional effects a problem here, and secondly, does the refined sampling scheme handle this? I’ll set up a new simulation that has 2 clusters, only one of which contains differentially abundant neighbourhoods.
set.seed(42)
r.n <- 1000
n.dim <- 50
block1.cells <- 250
# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues?
block1.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
block1.eigens <- block1.eigens[order(block1.eigens)]
block1.p <- qr.Q(qr(matrix(rnorm(block1.cells^2, mean=4, sd=0.01), block1.cells)))
block1.sigma <- crossprod(block1.p, block1.p*block1.eigens)
block1.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block1.cells, mean=2, sd=0.01), sigma=block1.sigma))
block3.cells <- 250
# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues?
block3.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
block3.eigens <- block3.eigens[order(block3.eigens)]
block3.p <- qr.Q(qr(matrix(rnorm(block3.cells^2, mean=4, sd=0.01), block3.cells)))
block3.sigma <- crossprod(block3.p, block3.p*block3.eigens)
block3.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block3.cells, mean=5, sd=0.01), sigma=block3.sigma))
sim2.gex <- do.call(cbind, list("b1"=block1.gex, "b3"=block3.gex))
sim2.pca <- prcomp_irlba(t(sim2.gex), n=50, scale.=TRUE, center=TRUE)
pairs(sim2.pca$x[, c(1:5)])
I’ll use the reduced dimensions here to compute a KNN-graph and visualise it using a Fructerman-Reingold layout.
set.seed(42)
sim2.knn <- buildKNNGraph(x=sim2.pca$x[, c(1:30)], k=21, d=NA, transposed=TRUE)
sim2.fr.layout <- layout_with_fr(sim2.knn)
plot(sim2.knn, layout=sim2.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
vertex.label.dist=1, edge.arrow.size=0.2)
Also a UMAP layout.
set.seed(42)
stem.ta.umap <- umap(sim2.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
plot(stem.ta.umap$layout, col=c(rep("red", block1.cells), rep("orange", block3.cells)),
xlab="UMAP 1", ylab="UMAP 2")
Within each of these clouds of points I will randomly label 1:9 in block 1 and 1:1 in block 2.
set.seed(42)
block1.cond <- rep("A", block1.cells)
block1.a <- sample(1:block1.cells, size=floor(block1.cells*0.1))
block1.b <- setdiff(1:block1.cells, block1.a)
block1.cond[block1.b] <- "B"
block3.cond <- rep("A", block3.cells)
block3.a <- sample(1:block3.cells, size=floor(block3.cells*0.5))
block3.b <- setdiff(1:block3.cells, block3.a)
block3.cond[block3.b] <- "B"
meta.df <- data.frame("Block"=c(rep("B1", block1.cells), rep("B3", block3.cells)),
"Condition"=c(block1.cond, block3.cond),
"Replicate"=c(rep("R1", floor(block1.cells*0.33)), rep("R2", floor(block1.cells*0.33)),
rep("R3", block1.cells-(2*floor(block1.cells*0.33))),
rep("R1", floor(block3.cells*0.33)), rep("R2", floor(block3.cells*0.33)),
rep("R3", block3.cells-(2*floor(block3.cells*0.33)))))
meta.df <- cbind(meta.df, stem.ta.umap$layout)
colnames(meta.df) <- c("Block", "Condition", "Replicate", "UMAP1", "UMAP2")
# define a "sample" as teh combination of condition and replicate
meta.df$Sample <- paste(meta.df$Condition, meta.df$Replicate, sep="_")
meta.df$Vertex <- c(1:nrow(meta.df))
ggplot(meta.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(aes(colour=Block, shape=Replicate)) +
theme_clean() +
scale_colour_npg() +
facet_wrap(~Condition) +
guides(colour=guide_legend(override.aes=list(size=3)),
shape=guide_legend(override.aes=list(size=3)))
The refined sampling scheme leads to large neighbourhoods overall - we think this might increase power and sensitivity as the counts in each will also be larger and therefore more stable.
sim2.counts <- quant_neighbourhood(graph=sim2.knn, meta=meta.df, sample.column='Sample', sample.vertices=n.hood)
sample.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
"Replicate"=rep(c("R1", "R2", "R3"), 2))
sample.meta$Sample <- paste(sample.meta$Condition, sample.meta$Replicate, sep="_")
rownames(sample.meta) <- sample.meta$Sample
# sim2.model <- model.matrix(~ 0 + Condition, data=sample.meta)
sim2.model <- model.matrix(~ Condition, data=sample.meta)
head(sim2.model)
(Intercept) ConditionB
A_R1 1 0
A_R2 1 0
A_R3 1 0
B_R1 1 1
B_R2 1 1
B_R3 1 1
I have a model matrix and counts matrix - let’s test edgeR on these.
count.means <- rowMeans(sim2.counts[, rownames(sim2.model)])
count.vars <- apply(sim2.counts[, rownames(sim2.model)], 1, var)
plot(count.means, count.vars)
The data are overdispersed. The model normalisation is causing some consternation. These all rely on normalising the neighbourhood counts by some factor. What if the normalisation uses the total number of cells in the experiment for each sample, rather than the counts in neighbourhoods, which will always be higher because cells are counted multiple times.
sim2.dge <- DGEList(sim2.counts[, rownames(sim2.model)], lib.size=log(colSums(sim2.counts[, rownames(sim2.model)])))
sim2.dge <- estimateDisp(sim2.dge, sim2.model)
sim2.fit <- glmQLFit(sim2.dge, sim2.model, robust=TRUE)
sim2.res <- as.data.frame(topTags(glmQLFTest(sim2.fit, coef=2), sort.by='none', n=Inf))
sim2.res$Neighbourhood <- as.numeric(rownames(sim2.res))
sim2.spatialfdr <- graph_spatialFDR(neighborhoods=vertex.list, graph=sim2.knn, connectivity="distance",
pvalues=sim2.res[order(sim2.res$Neighbourhood), ]$PValue,
pca=sim2.pca$x[, c(1:30)])
sim2.res$SpatialFDR[order(sim2.res$Neighbourhood)] <- sim2.spatialfdr
qvals <- sim2.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 29 21
This is at a 1% FDR.
sim2.neighbour.exprs <- neighborhood_expression(vertex.list, sim2.gex)
Embed these hyperspheres with a PCA and UMAP.
sim2.neighbour.pca <- prcomp((t(sim2.neighbour.exprs)))
set.seed(42)
neighbourhood.umap <- umap(sim2.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
plot(neighbourhood.umap$layout,
xlab="UMAP 1", ylab="UMAP 2")
We can overlay the DA testing on these neighbourhoods.
neighbor.df <- sim2.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
neighbor.df <- do.call(cbind.data.frame, list(neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
neighbor.df$Sig <- as.numeric(neighbor.df$SpatialFDR <= 0.01)
ggplot(neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbor.df[neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=neighbor.df[neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
Is that a subtle compositional effect in the 1 neighbourhood that is depleted? That cluster should not contain any DA neighbourhoods.
neighbour.list <- list()
for(x in seq_along(1:length(vertex.list))){
x.df <- meta.df[meta.df$Vertex %in% vertex.list[[x]], ]
x.rep <- names(table(x.df$Replicate))[which(table(x.df$Replicate) == max(table(x.df$Replicate)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Block))[which(table(x.df$Block) == max(table(x.df$Block)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Condition))[which(table(x.df$Condition) == max(table(x.df$Condition)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Block"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
neighbour.meta <- do.call(rbind.data.frame, neighbour.list)
neighbour.merge <- merge(neighbor.df, neighbour.meta, by='Neighbourhood')
neighbour.merge$Block <- ordered(neighbour.merge$Block,
levels=c("B1", "B3"))
neighbour.merge$Diff <- sign(neighbour.merge$logFC)
neighbour.merge$Diff[neighbour.merge$Sig == 0] <- 0
ggplot(neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=neighbour.merge[neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Block)
This doesn’t make sense - Block 3 shouldn’t have any DA neighbourhoods. Is this a compositional effect we’re seeing here? It’s strange that a random fluctuation would cause this - it must be incredibly sensitive. This is also sample-size dependent, smaller total sample sizes are less susceptible for some reason.
table(neighbour.merge$Block, neighbour.merge$Diff)
-1 0 1
B1 0 0 20
B3 1 29 0
That single depleted neighbourhood in B3 is, I think, a compositional effect. Does the refined sampling deal with this in some way, either by having neighbourhoods with larger, and thus more stable counts?
all.samps <- unique(paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_"))
meta.df$All.Sample <- paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_")
all.count.matrix <- matrix(0L, ncol=length(all.samps), nrow=length(vertex.list))
colnames(all.count.matrix) <- all.samps
for(x in seq_along(1:length(vertex.list))){
v.x <- vertex.list[[x]]
for(i in seq_along(1:length(all.samps))){
i.s <- all.samps[i]
i.s.vertices <- intersect(v.x, meta.df[meta.df$All.Sample == i.s, ]$Vertex)
all.count.matrix[x, i] <- length(i.s.vertices)
}
}
all.count.melt <- melt(all.count.matrix)
all.count.melt$Var2 <- as.character(all.count.melt$Var2)
all.count.melt$Block <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[1])))
all.count.melt$Condition <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[2])))
all.count.melt$Replicate <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[3])))
ggplot(all.count.melt[all.count.melt$Var1 %in% c(neighbour.merge$Neighbourhood[neighbour.merge$Sig == 1]) &
all.count.melt$Block %in% c("B3"), ],
aes(x=Block, y=value, fill=Condition)) +
geom_boxplot() +
theme_clean() +
facet_wrap(~Var1, scales="free_y")
These are the counts for B3 in the DANs. N21 looks like the effect is from sampling variance, as the counts are really quite low.
refine_vertex <- function(vertex.knn, v.ix, X_pca){
# vertex.knn: KNN graph for randomly sampled points (output of BiocNeighbors::findKNN)
# v.ix: index of vertex to refine in vertex.knn
## Calculate median profile of KNNs of vertex
v.med <- apply(X_pca[vertex.knn$index[v.ix,],], 2, median)
## Find the closest point to the median and sample
refined.vertex <- BiocNeighbors::findKNN(rbind(v.med, X_pca), subset=1, k=1)[["index"]][1] - 1 ## -1 to remove the median
return(refined.vertex)
}
quant_neighbourhood <- function(graph, meta, sample.column='Sample', sample.vertices=0.25, seed=42, pca=NULL, sample="random"){
set.seed(seed)
if(sample == "random"){
# define a set of vertices and neihbourhood centers - extract the neihbourhoods of these cells
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
} else if(sample == "refined"){
if(is.null(pca)){
stop("Please pass a PCA object - expected output from prcomp()")
}
X_pca <- pca$x[, c(1:30)]
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
vertex.list.refined <- sapply(1:length(refined.vertices), FUN=function(X) neighbors(graph, v=refined.vertices[X]))
vertex.list <- vertex.list.refined
}
count.matrix <- matrix(0L, ncol=length(unique(meta[, sample.column])), nrow=length(vertex.list))
colnames(count.matrix) <- unique(meta[, sample.column])
for(x in seq_along(1:length(vertex.list))){
v.x <- vertex.list[[x]]
for(i in seq_along(1:length(unique(meta[, sample.column])))){
i.s <- unique(meta[, sample.column])[i]
i.s.vertices <- intersect(v.x, meta[meta[, sample.column] == i.s, ]$Vertex)
count.matrix[x, i] <- length(i.s.vertices)
}
}
rownames(count.matrix) <- c(1:length(vertex.list))
return(count.matrix)
}
sim2.counts <- quant_neighbourhood(graph=sim2.knn, meta=meta.df, sample.column='Sample', sample.vertices=n.hood, sample="refined", pca=sim2.pca)
sample.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
"Replicate"=rep(c("R1", "R2", "R3"), 2))
sample.meta$Sample <- paste(sample.meta$Condition, sample.meta$Replicate, sep="_")
rownames(sample.meta) <- sample.meta$Sample
# sim2.model <- model.matrix(~ 0 + Condition, data=sample.meta)
sim2.model <- model.matrix(~ Condition, data=sample.meta)
head(sim2.model)
(Intercept) ConditionB
A_R1 1 0
A_R2 1 0
A_R3 1 0
B_R1 1 1
B_R2 1 1
B_R3 1 1
I have a model matrix and counts matrix - let’s test edgeR on these.
sim2.dge <- DGEList(sim2.counts[, rownames(sim2.model)], lib.size=log(colSums(sim2.counts[, rownames(sim2.model)])))
sim2.dge <- estimateDisp(sim2.dge, sim2.model, tagwise=TRUE)
sim2.fit <- glmQLFit(sim2.dge, sim2.model, robust=TRUE)
# sim2.contrast <- makeContrasts(ConditionA - ConditionB, levels=sim2.model)
# sim2.res <- glmQLFTest(sim2.fit, contrast=sim2.contrast)
sim2.res <- as.data.frame(topTags(glmQLFTest(sim2.fit, coef=2), sort.by='none', n=Inf))
sim2.res$Neighbourhood <- as.numeric(rownames(sim2.res))
sim2.spatialfdr <- graph_spatialFDR(neighborhoods=vertex.list.refined, graph=sim2.knn, connectivity="distance",
pvalues=sim2.res[order(sim2.res$Neighbourhood), ]$PValue,
pca=sim2.pca$x[, c(1:30)])
sim2.res$SpatialFDR[order(sim2.res$Neighbourhood)] <- sim2.spatialfdr
qvals <- sim2.spatialfdr
is.sig <- qvals <= 0.01
summary(is.sig)
Mode FALSE TRUE
logical 30 20
That’s a lot of DA neigbbourhoods!
sim2.neighbour.exprs <- neighborhood_expression(vertex.list.refined, sim2.gex)
Embed these hyperspheres with a PCA and UMAP.
sim2.neighbour.pca <- prcomp((t(sim2.neighbour.exprs)))
set.seed(42)
neighbourhood.umap <- umap(sim2.neighbour.pca$x[, c(1:30)],
n_components=2,
n_neighbors=21, metric='euclidean',
init='random', min_dist=0.1)
plot(neighbourhood.umap$layout,
xlab="UMAP 1", ylab="UMAP 2")
We can overlay the DA testing on these neighbourhoods.
neighbor.df <- sim2.res[, c("logFC", "Neighbourhood", "SpatialFDR")]
neighbor.df <- do.call(cbind.data.frame, list(neighbor.df, as.data.frame(neighbourhood.umap$layout)))
colnames(neighbor.df) <- c("logFC", "Neighbourhood", "SpatialFDR", "UMAP1", "UMAP2")
neighbor.df$Sig <- as.numeric(neighbor.df$SpatialFDR <= 0.01)
ggplot(neighbor.df, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbor.df[neighbor.df$Sig == 0, ],
colour='grey80', size=2) +
geom_point(data=neighbor.df[neighbor.df$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red")
No more false DANs in B3, despite the same number of neighbourhoods being counted.
neighbour.list <- list()
for(x in seq_along(1:length(vertex.list.refined))){
x.df <- meta.df[meta.df$Vertex %in% vertex.list.refined[[x]], ]
x.rep <- names(table(x.df$Replicate))[which(table(x.df$Replicate) == max(table(x.df$Replicate)))]
if(length(x.rep) > 1){
x.rep <- sample(size=1, x.rep)
}
x.block <- names(table(x.df$Block))[which(table(x.df$Block) == max(table(x.df$Block)))]
if(length(x.block) > 1){
x.block <- sample(size=1, x.block)
}
x.condition <- names(table(x.df$Condition))[which(table(x.df$Condition) == max(table(x.df$Condition)))]
if(length(x.condition) > 1){
x.condition <- sample(size=1, x.condition)
}
neighbour.list[[x]] <- data.frame("Replicate"=x.rep, "Block"=x.block, "Condition"=x.condition, "Neighbourhood"=x)
}
neighbour.meta <- do.call(rbind.data.frame, neighbour.list)
neighbour.merge <- merge(neighbor.df, neighbour.meta, by='Neighbourhood')
neighbour.merge$Block <- ordered(neighbour.merge$Block,
levels=c("B1", "B3"))
neighbour.merge$Diff <- sign(neighbour.merge$logFC)
neighbour.merge$Diff[neighbour.merge$Sig == 0] <- 0
ggplot(neighbour.merge, aes(x=UMAP1, y=UMAP2)) +
geom_point(data=neighbour.merge[, c("UMAP1", "UMAP2")],
colour='grey80', size=1) +
geom_point(data=neighbour.merge[neighbour.merge$Sig == 1, ],
aes(colour=logFC), size=4) +
theme_clean() +
scale_colour_gradient2(low="blue", mid="grey80", high="red") +
facet_wrap(~Block)
This doesn’t make sense - Block 3 shouldn’t have any DA neighbourhoods. Is this a compositional effect we’re seeing here? It’s strange that a random fluctuation would cause this - it must be incredibly sensitive.
table(neighbour.merge$Block, neighbour.merge$Diff)
0 1
B1 0 20
B3 30 0
all.samps <- unique(paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_"))
meta.df$All.Sample <- paste(meta.df$Block, meta.df$Condition, meta.df$Replicate, sep="_")
all.count.matrix <- matrix(0L, ncol=length(all.samps), nrow=length(vertex.list.refined))
colnames(all.count.matrix) <- all.samps
for(x in seq_along(1:length(vertex.list.refined))){
v.x <- vertex.list.refined[[x]]
for(i in seq_along(1:length(all.samps))){
i.s <- all.samps[i]
i.s.vertices <- intersect(v.x, meta.df[meta.df$All.Sample == i.s, ]$Vertex)
all.count.matrix[x, i] <- length(i.s.vertices)
}
}
all.count.melt <- melt(all.count.matrix)
all.count.melt$Var2 <- as.character(all.count.melt$Var2)
all.count.melt$Block <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[1])))
all.count.melt$Condition <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[2])))
all.count.melt$Replicate <- unlist(lapply(strsplit(all.count.melt$Var2, split="_", fixed=TRUE),
FUN=function(XP) paste0(XP[3])))
ggplot(all.count.melt[all.count.melt$Var1 %in% c(neighbour.merge$Neighbourhood[neighbour.merge$Sig == 1]) &
all.count.melt$Block %in% c("B3"), ],
aes(x=Block, y=value, fill=Condition)) +
geom_boxplot() +
theme_clean() +
facet_wrap(~Var1, scales="free_y")
If we want to base the spatial FDR correction on a distance then we need some way to retain this information in the graph to speed-up the calculations for large and highly-connected graphs. Our idea is to re-code the buildKNNGraph() function to retain cell-cell distances in the edge weights slot of the igraph object.
# set.seed(42)
# sim2.knn <- .buildKNNGraph(x=sim2.pca$x[, c(1:30)], k=21, d=NA, transposed=TRUE)
# sim2.fr.layout <- layout_with_fr(sim2.knn)
# plot(sim2.knn, layout=sim2.fr.layout, vertex.frame.color='skyblue', vertex.color='skyblue', vertex.label.color='black',
# vertex.label.family='Helvetica', edge.color='grey60', vertex.label.cex=0.9,
# vertex.label.dist=1, edge.arrow.size=0.2)
NB: Storing the distances as edge weights is far too memory intensive. Perhaps working off the back of the SingleCellExperiment would be better from a design point of view.
library(ggplot2)
library(SingleCellExperiment)
library(tidyverse)
library(igraph)
library(scran)
devtools::load_all("~/milo/miloR/")
Using dyntoy, I simulate a scRNA-seq data with cells forming a linear trajectory (no branches) with 5000 cells and 10 main states (milestones).
set.seed(42)
dataset <- generate_dataset(
model = model_linear(num_milestones = 10),
num_cells = 5000,
num_features = 5000
)
|=============================================================================== | 95% ~0 s remaining
|================================================================================= | 97% ~0 s remaining
|=================================================================================== | 99% ~0 s remaining
gex <- as.matrix(dataset$expression)
branches <- dataset$prior_information$groups_id
## Dimensionality reduction
pca <- prcomp_irlba(gex, n=50, scale.=TRUE, center=TRUE)
X_pca = pca$x[, c(1:30)]
I assign cells to simulated biological conditions and replicates, that we will use for differential abundance testing. For each of the \(M\) clusters, I assign different proportions of cells to condition A or condition B, while simulating proportionate mixing between replicates.
Construct a Milo object
sim_milo
class: Milo
dim: 5000 5000
metadata(0):
assays(2): counts logcounts
rownames(5000): G1 G2 ... G4999 G5000
rowData names(0):
colnames(5000): C1 C2 ... C4999 C5000
colData names(4): group_id condition replicate sample
reducedDimNames(1): PCA
altExpNames(0):
neighbourhoods dimensions(1): 0
neighbourhoodCounts dimensions(2): 1 1
neighbourDistances dimensions(2): 1 1
graph names(0):
neighbourhoodIndex names(1): 0
sim_milo <- buildGraph(sim_milo, k = 20, d = 30)
Constructing kNN graph with k:20
Retrieving distances from 20 nearest neighbours
Using sampling of 10% random points
sim_milo <- countCells(sim_milo, data = data.frame(colData(sim_milo)), samples = "sample")
Checking data validity
Setting up matrix with 500 neighbourhoods
Counting cells in neighbourhoods
head(neighbourhoodCounts(sim_milo))
6 x 6 sparse Matrix of class "dgCMatrix"
B_R1 A_R1 B_R2 A_R2 A_R3 B_R3
1 2 7 2 3 4 2
2 11 5 12 3 5 12
3 9 7 11 3 5 6
4 7 2 3 . 1 11
5 . 7 2 9 4 1
6 3 3 5 5 5 3
Visualize results in embedding
We adopt the refined sampling strategy applied in Wishbone, and adapted from here. Briefly, to avoid selecting outliers with random sampling, I first randomly select \(n\) cells. For each sampled cell I then identify its k neares neighbors and compute the median profile of the neighbors (in this case the profile in reduced PC space). Then I replace each sampled cell by the cell closest to the median profile of its neighbors.
sim_milo_ref <- makeNeighbourhoods(sim_milo,prop = 0.1, k=20, d=30, refined = TRUE)
Checking valid object
With the refined sampling scheme I select cells with a larger neighbourhood on average.
When \(n\) is large I often end up sampling less than \(n\) cells because for many randomly sampled cells the cell closest to the KNNs is the same.
seq(0.01,0.5, by = 0.05)
[1] 0.01 0.06 0.11 0.16 0.21 0.26 0.31 0.36 0.41 0.46
res_ref <- testNeighbourhoods(sim_milo_ref, ~ 1 + condition, data = design_df, fdr.weighting = "k-distance")
Performing spatial FDR correction
Refined sampling seems to be able to identify DA at both ends of the spectrum better
plotMiloReducedDim(sim_milo, res_rand, pt_size=2) + ggtitle("Random sampling")
plotMiloReducedDim(sim_milo_ref, res_ref, pt_size=2) + ggtitle("Refined sampling")
As expected multiple testing correction is less severe with the refined sample set (less points)
res_rand %>%
ggplot(aes(-log10(PValue), -log10(SpatialFDR))) +
geom_abline(linetype=2) +
geom_point() +
ggtitle("Refined sampling")
res_ref %>%
ggplot(aes(-log10(PValue), -log10(SpatialFDR))) +
geom_abline(linetype=2) +
ggtitle("Random sampling") +
geom_point()
I want to select these parameters to increase mean nh size
I want to check whether using refined sampling allows to have more logFC even with different sampling
mean_nh_sizes
[1] 28.64286 33.20732 35.25000 38.78472 38.53125 49.94444 62.41176 64.72549 74.22481 74.93976 63.75862 83.53968 90.72917
[14] 100.84348 106.32168 75.80000 100.18644 115.23077 124.29630 131.64085 84.44000 116.83051 136.33333 149.01010 160.13846 27.04762
[27] 30.27891 34.40984 35.60887 35.00300 48.63158 57.05983 64.41975 65.90868 68.34657 65.31111 78.37963 89.54015 93.79104
[40] 99.00800 76.78049 96.50000 112.80488 120.74011 125.87391 84.68293 112.50000 131.10000 143.05357 150.30097 26.48810 31.16949
[53] 31.85039 32.59831 33.01573 46.52055 55.45946 60.25123 61.62791 64.67363 60.72414 75.69173 85.32955 88.09506 92.12575
[66] 71.59259 94.60484 108.88387 112.68333 119.77517 80.38636 111.71287 128.65248 136.46635 144.13910 25.00980 28.42035 31.47619
[79] 31.28369 32.08755 45.50562 52.29775 58.47490 59.66289 62.16317 60.88889 74.37931 84.51402 85.57413 90.45431 72.01471
[92] 92.57812 107.51020 110.75746 115.67049 80.37288 106.41739 126.79651 133.06276 142.97603 26.21552 28.05512 31.16071 30.49407
[105] 31.15677 44.42105 52.66497 58.34082 56.54884 60.65984 59.76471 74.73171 83.37885 83.07163 87.92148 69.76000 92.60000
[118] 105.17647 108.30323 112.15365 79.95161 110.25210 126.00546 131.74170 138.67422
run_milo_sampling <- function(graph, meta.df, model, X_pca, seed=42, sample.vertices=0.1){
set.seed(seed)
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
vertex.list.refined <- sapply(1:length(refined.vertices), FUN=function(X) neighbors(graph, v=refined.vertices[X]))
count.matrix.random <- countCells(sim2.knn, meta.df, vertex.list = vertex.list, random.vertices = random.vertices, sample.column = "sample")
count.matrix.refined <- countCells(sim2.knn, meta.df, vertex.list = vertex.list.refined, random.vertices = refined.vertices, sample.column = "sample")
spFDR.random <- testQLF(graph, count.matrix.random, model)
spFDR.refined <- testQLF(graph, count.matrix.refined, model)
fdr.df.random <- data.frame(Vertex=as.integer(rownames(spFDR.random$res)), p=spFDR.random$res$PValue, adjp=spFDR.random$spFDR, adjp_fdr=spFDR.random$res$FDR, logFC=spFDR.random$res$logFC, Sig=spFDR.random$res$Sig)
fdr.df.refined <- data.frame(Vertex=as.integer(rownames(spFDR.refined$res)), p=spFDR.refined$res$PValue, adjp=spFDR.refined$spFDR, logFC=spFDR.refined$res$logFC, adjp_fdr=spFDR.refined$res$FDR, Sig=spFDR.refined$res$Sig)
return(list(random=fdr.df.random, refined=fdr.df.refined))
}
sample_perc5 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.05))
sample_perc10 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.1))
sample_perc15 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.15))
sample_perc20 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.2))
make_test_df <- function(sample_df){
sample_df %>%
imap( ~ bind_rows(.x[["refined"]] %>% dplyr::mutate(sampling="refined"),
.x[["random"]] %>% dplyr::mutate(sampling="random")) %>%
dplyr::mutate(s=.y)) %>%
purrr::reduce(bind_rows) %>%
left_join(data_5k_cells$meta.df) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>%
group_by(sampling, s, group_id) %>%
summarise(mean_logFC=mean(logFC))
}
map(list(perc5=sample_perc5, perc10=sample_perc10, perc15=sample_perc15, perc20=sample_perc20), ~ make_test_df(.x)) %>%
imap( ~ dplyr::mutate(.x, perc=.y)) %>%
purrr::reduce(bind_rows) %>%
ggplot(aes(group_id, mean_logFC, color=perc)) +
# geom_pointrange(stat = "summary",
# fun.min = min,
# fun.max = max,
# fun = mean) +
geom_boxplot(varwidth = TRUE) +
facet_grid(.~sampling) +
scale_fill_gradient2()
No big differences TBH
Do I get high/low FC where unexpected just because things are changing elsewhere?
data_2k_cells <- simulate_linear_traj(num_cells = 2000, num_milestones = 10, prob_start = 0.5, prob_end=0.95)
ggplot(data_2k_cells$meta.df, aes(UMAP1, UMAP2, color=condition)) + geom_point(size=0.2) +
theme_clean()
ggplot(data_2k_cells$meta.df, aes(UMAP1, UMAP2, color=group_id)) + geom_point(size=0.2) +
theme_clean() +
geom_text(data = . %>% group_by(group_id) %>% summarise(UMAP1=first(UMAP1), UMAP2=first(UMAP2)), aes(label=group_id), color="black")
graph <- data_2k_cells$graph
sample.vertices <- 0.1
meta.df <- data_2k_cells$meta.df
model <- data_2k_cells$model
X_pca <- data_2k_cells$X_pca
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
vertex.list.refined <- sapply(1:length(refined.vertices), FUN=function(X) neighbors(graph, v=refined.vertices[X]))
count.matrix.random <- countCells(graph, meta.df, vertex.list = vertex.list, random.vertices = random.vertices, sample.column = "sample")
count.matrix.refined <- countCells(graph, meta.df, vertex.list = vertex.list.refined, random.vertices = refined.vertices, sample.column = "sample")
spFDR.random <- testQLF(graph, count.matrix.random, model)
spFDR.refined <- testQLF(graph, count.matrix.refined, model)
Refined sampling seems to be able to identify DA at both ends of the spectrum better
fdr.df.random <- data.frame(Vertex=as.integer(rownames(spFDR.random$res)), p=spFDR.random$res$PValue, adjp=spFDR.random$spFDR, adjp_fdr=spFDR.random$res$FDR, logFC=spFDR.random$res$logFC, Sig=spFDR.random$res$Sig)
fdr.df.refined <- data.frame(Vertex=as.integer(rownames(spFDR.refined$res)), p=spFDR.refined$res$PValue, adjp=spFDR.refined$spFDR, logFC=spFDR.refined$res$logFC, adjp_fdr=spFDR.refined$res$FDR, Sig=spFDR.refined$res$Sig)
meta.df %>%
left_join(fdr.df.random) %>%
# dplyr::arrange(sampled) %>%
ggplot(aes(UMAP1, UMAP2,
# color= - log10(adjp),
# color= - log10(p),
color = logFC
)) +
geom_point(size=0.5) +
geom_point(data=. %>% dplyr::filter(!is.na(adjp))) +
theme_clean() +
scale_color_gradient2(midpoint = 0, high = "red", low="blue",na.value ="grey80") +
ggtitle("Random sampling")
meta.df %>%
left_join(fdr.df.refined) %>%
# dplyr::arrange(sampled) %>%
ggplot(aes(UMAP1, UMAP2,
# color= - log10(adjp),
# color= - log10(p),
color = logFC
)) +
geom_point(size=0.5) +
geom_point(data=. %>% dplyr::filter(!is.na(adjp))) +
theme_clean() +
scale_color_gradient2(midpoint = 0, high = "red", low="blue",na.value ="grey80") +
ggtitle("Refined sampling")
meta.df %>%
left_join(fdr.df.refined) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(logFC, -log10(adjp), shape=Sig, color=group_id)) +
geom_point() +
ggtitle("refined sampling") +
meta.df %>%
left_join(fdr.df.random) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(logFC, -log10(adjp), shape=Sig, color=group_id)) +
geom_point() +
ggtitle("random sampling")
num_milestones=10
meta.df %>%
ungroup() %>%
left_join(fdr.df.refined) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(group_id, logFC, shape=Sig, color=group_id, size= -log10(adjp))) +
geom_jitter() +
ggtitle("refined sampling") +
meta.df %>%
ungroup() %>%
left_join(fdr.df.random) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>% dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(group_id, logFC, shape=Sig, color=group_id, size= -log10(adjp))) +
geom_jitter() +
ggtitle("random sampling")
simulate_linear_traj <- function(num_cells, num_milestones, num_features=1000, k_param=21, seed=42,
prob_start=0.1, prob_end=0.9){
set.seed(seed)
## Generate simulated dataset of trajectory
dataset <- generate_dataset(
model = model_linear(num_milestones = num_milestones),
num_cells = num_cells,
num_features = num_features
)
sim2.gex <- as.matrix(dataset$expression)
sim2.branches <- dataset$prior_information$groups_id
sim2.time = dataset$prior_information$timecourse_continuous
## Build graph
sim2.pca <- prcomp_irlba(sim2.gex, n=50, scale.=TRUE, center=TRUE)
X_pca = sim2.pca$x[, c(1:30)]
sim2.knn <- buildKNNGraph(x=X_pca, k=k_param, d=NA, transposed=TRUE)
## Run UMAP
stem.ta.umap <- umap(sim2.pca$x[, c(1:30)],
n_components=2,
n_neighbors=k_param, metric='euclidean',
init='random', min_dist=0.1)
dyn.df <- data.frame(UMAP1=stem.ta.umap$layout[,1], UMAP2=stem.ta.umap$layout[,2],
cell_id=rownames(sim2.gex), time=sim2.time)
dyn.df <- dyn.df %>% left_join(sim2.branches)
## Simulate conditions
n_groups <- length(unique(dyn.df$group_id))
p_vec <- seq(prob_start, prob_end, length.out = n_groups)
a.cells <- c()
for (i in 1:n_groups) {
g <- paste0("M",i)
p <- p_vec[i]
m.A <- sample(dyn.df$cell_id[dyn.df$group_id==g],
size=floor(sum(dyn.df$group_id==g)*p))
a.cells <- c(a.cells, m.A)
}
dyn.df <- dyn.df %>% dplyr::mutate(condition = ifelse(cell_id %in% a.cells, "A", 'B'))
## Simulate replicates
dyn.df <- dyn.df %>%
group_by(group_id) %>%
dplyr::mutate(replicate=c(rep("R1", floor(n()*0.3)),
rep("R2", floor(n()*0.3)),
rep("R3", n() - 2*(floor(n()*0.3))))
)
## Add sample name (condition + replicate)
dyn.df$sample <- paste(dyn.df$condition, dyn.df$replicate, sep="_")
## Add vertex id (for counts)
dyn.df$Vertex <- as.vector(V(sim2.knn))
## Make model matrix for testing
sample.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
"Replicate"=rep(c("R1", "R2", "R3"), 2))
sample.meta$Sample <- paste(sample.meta$Condition, sample.meta$Replicate, sep="_")
rownames(sample.meta) <- sample.meta$Sample
sim2.model <- model.matrix(~ 0 + Condition, data=sample.meta)
return(list(graph=sim2.knn,
X_pca=X_pca,
meta.df=dyn.df,
model=sim2.model))
}
data_2k_cells <- simulate_linear_traj(num_cells = 2000, num_milestones = 10, prob_start = 0.5, prob_end=0.95)
graph <- data_2k_cells$graph
sample.vertices <- 0.1
meta.df <- data_2k_cells$meta.df
model <- data_2k_cells$model
X_pca <- data_2k_cells$X_pca
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
data.frame(random=as.numeric(random.vertices), refined=as.numeric(refined.vertices)) %>%
rowid_to_column() %>%
group_by(as.factor(refined)) %>%
dplyr::mutate(n_converging = n()) %>%
ungroup() %>%
pivot_longer(cols=c('random', "refined"), names_to = "sampling_scheme", values_to = "Vertex") %>%
left_join(meta.df, by="Vertex") %>%
ggplot(aes(time,sampling_scheme, color=n_converging)) +
geom_point(size=0.5) +
geom_line(aes(group=rowid), size=0.5) +
scale_color_viridis_c()
data.frame(random=as.numeric(random.vertices), refined=as.numeric(refined.vertices)) %>%
rowid_to_column() %>%
group_by(as.factor(refined)) %>%
dplyr::mutate(n_converging = n()) %>%
ggplot(aes(as.factor(n_converging))) + geom_histogram(stat="count")
Distances should become more uniform
get_dist_to_closest_neigh <- function(graph, sample.vertices){
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
dist_to_closest_random <- BiocNeighbors::findKNN(X=X_pca[as.vector(random.vertices),], k=1)[["distance"]]
dist_to_closest_refined <- BiocNeighbors::findKNN(X=X_pca[unique(as.vector(refined.vertices)),], k=1)[["distance"]]
dist_df <- bind_rows(data.frame(distance_to_closest=dist_to_closest_refined, sampling_scheme='refined'),
data.frame(distance_to_closest=dist_to_closest_random, sampling_scheme='random')) %>%
dplyr::mutate(sample_perc=sample.vertices)
}
dist_ls <- map(seq(0.1,0.6, by = 0.05), ~ get_dist_to_closest_neigh(graph, .x))
purrr::reduce(dist_ls, bind_rows) %>%
ggplot(aes(as.factor(sample_perc), distance_to_closest, color=sampling_scheme)) +
# ggbeeswarm::geom_quasirandom()
geom_boxplot(varwidth = TRUE) +
xlab("% sampled") + ylab("Distance to closest sample") +
theme_grey(base_size = 14)
data_5k_cells <- simulate_linear_traj(num_cells = 5000, num_milestones = 10)
meta.df <- data_5k_cells$meta.df
model <- data_5k_cells$model
X_pca <- data_5k_cells$X_pca
k_vec <- seq(10,50, by=5)
graph_ls <- map(k_vec, ~ buildKNNGraph(x=X_pca, k=.x, d=NA, transposed=TRUE))
get_neigh_df <- function(sampled_vertices, graph, X_pca, k_param, sampling_mode="random"){
if (sampling_mode=="refined") {
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=k_param, subset=as.vector(sampled_vertices))
sampled_vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
}
sampled_vertices <- unique(sampled_vertices)
vertex.list <- sapply(1:length(sampled_vertices), FUN=function(X) neighbors(graph, v=sampled_vertices[X]))
neigh_df <- data.frame(neigh_vertex=as.vector(sampled_vertices), neigh_size=sapply(vertex.list, function(x) length(x)),
sampling_mode=sampling_mode, k=k_param)
return(neigh_df)
}
neigh_df_ls <- lapply(seq_along(k_vec), function(i){
random_sample <- sample(V(graph_ls[[i]]), size=floor(sample.vertices*length(V(graph_ls[[i]]))))
sampled_vertices <- random_sample
random_neigh_df <- get_neigh_df(random_sample, graph_ls[[i]], X_pca, k_vec[i], sampling_mode="random")
refined_neigh_df <- get_neigh_df(random_sample, graph_ls[[i]], X_pca, k_vec[i], sampling_mode="refined")
bind_rows(random_neigh_df, refined_neigh_df)
})
purrr::reduce(neigh_df_ls, bind_rows) %>%
ggplot(aes(as.factor(k), neigh_size, color=sampling_mode)) +
geom_violin(scale = "width") +
geom_boxplot(width=0.2) +
facet_wrap(sampling_mode~.) +
xlab("K") + ylab("Neighborhood size") +
theme_clean(base_size = 18)
purrr::reduce(neigh_df_ls, bind_rows) %>%
ggplot(aes(as.factor(k), neigh_size / k, color=sampling_mode)) +
geom_violin(scale = "width") +
geom_boxplot(width=0.2) +
facet_wrap(sampling_mode~.) +
xlab("K") + ylab("Neighborhood size / K") +
theme_clean(base_size = 18)
purrr::reduce(neigh_df_ls, bind_rows) %>%
group_by(sampling_mode, k) %>%
summarise(n=n()) %>%
ggplot(aes(k,n, color=sampling_mode)) +
geom_point()
library(SingleCellExperiment)
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply,
parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call,
duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
Reduce, rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which,
which.max, which.min
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor,
see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: DelayedArray
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following objects are masked from ‘package:Biobase’:
anyMissing, rowMedians
Attaching package: ‘DelayedArray’
The following objects are masked from ‘package:matrixStats’:
colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
The following objects are masked from ‘package:base’:
aperm, apply, rowsum
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.3.2 [32m✓[30m [34mpurrr [30m 0.3.4
[32m✓[30m [34mtibble [30m 3.0.3 [32m✓[30m [34mdplyr [30m 1.0.1
[32m✓[30m [34mtidyr [30m 1.1.1 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.5.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mcollapse()[30m masks [34mIRanges[30m::collapse()
[31mx[30m [34mdplyr[30m::[32mcombine()[30m masks [34mBiobase[30m::combine(), [34mBiocGenerics[30m::combine()
[31mx[30m [34mdplyr[30m::[32mcount()[30m masks [34mmatrixStats[30m::count()
[31mx[30m [34mdplyr[30m::[32mdesc()[30m masks [34mIRanges[30m::desc()
[31mx[30m [34mtidyr[30m::[32mexpand()[30m masks [34mS4Vectors[30m::expand()
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mfirst()[30m masks [34mS4Vectors[30m::first()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31mx[30m [34mggplot2[30m::[32mPosition()[30m masks [34mBiocGenerics[30m::Position(), [34mbase[30m::Position()
[31mx[30m [34mpurrr[30m::[32mreduce()[30m masks [34mGenomicRanges[30m::reduce(), [34mIRanges[30m::reduce()
[31mx[30m [34mdplyr[30m::[32mrename()[30m masks [34mS4Vectors[30m::rename()
[31mx[30m [34mpurrr[30m::[32msimplify()[30m masks [34mDelayedArray[30m::simplify()
[31mx[30m [34mdplyr[30m::[32mslice()[30m masks [34mIRanges[30m::slice()[39m
library(igraph)
Attaching package: ‘igraph’
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:purrr’:
compose, simplify
The following object is masked from ‘package:tidyr’:
crossing
The following object is masked from ‘package:tibble’:
as_data_frame
The following objects are masked from ‘package:DelayedArray’:
path, simplify
The following object is masked from ‘package:GenomicRanges’:
union
The following object is masked from ‘package:IRanges’:
union
The following object is masked from ‘package:S4Vectors’:
union
The following objects are masked from ‘package:BiocGenerics’:
normalize, path, union
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(scran)
devtools::load_all("~/miloR/")
import scanpy as sc
import pandas as pd
import numpy as np
## To show plots inline
import matplotlib.pyplot as plt
import sys,os
plt.switch_backend('agg')
sys.path.insert(1, '/nfs/team205/ed6/bin/thATAC/preprocess_utils/')
import atac_utils
import scipy.sparse
from pathlib import Path
# sc._settings.ScanpyConfig.figdir = Path(r.outdir)
Load anndata object, downloaded from here following the link from Park et al. 2020
rna_adata = sc.read_h5ad("/nfs/team205/ed6/data/Park_scRNAseq/HTA08.v01.A06.Science_human_tcells.raw.h5ad")
rna_adata.X = rna_adata.raw.X
rna_adata.X = scipy.sparse.csc_matrix(rna_adata.X)
Load MOFA projection, to use as reduced dimensionality object
mofa_dims <- read.csv("/nfs/team205/ed6/data/thymus_data/thymus_MOFA_projection.csv") %>%
column_to_rownames("cell")
## Filter just the scRNA-seq cells and factors 4 knn graoh construciton
mofa_dims <- as.matrix(mofa_dims[rownames(py$rna_adata$obs),1:5])
Convert anndata to SingleCellExperiment object
adata <- py$rna_adata
cnt <- t(adata$X)
rownames(cnt) <- adata$var_names$to_list()
colnames(cnt) <- adata$obs_names$to_list()
logCnt <- log2(cnt + 1)
pca <- prcomp(t(vx))
# Create the SingleCellExperiment object
sce <- SingleCellExperiment(assay=list(counts=cnt, logcounts=logCnt), colData = adata$obs)
reducedDim(sce) <- mofa_dims
reducedDimNames(sce) <- "MOFA"
sce
## Save SingleCellExperiment
saveRDS(sce, "/nfs/team205/ed6/data/Park_scRNAseq/HTA08.v01.A06.Science_human_tcells.SingleCellExperiment.RDS")
sce <- readRDS("~/Downloads/HTA08.v01.A06.Science_human_tcells.SingleCellExperiment.RDS")
sce
class: SingleCellExperiment
dim: 33694 50514
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(50514): FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369-AAACCTGAGCCTATGT ... Human_colon_16S7985397-TTTGCGCAGAGCCCAA
Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
object.size(sce)
16486280 bytes
Cells in different samples where sorted using different FACS gates, which affect significantly the cell type composition of different samples. To simplify interpretation of DA analysis, I retain only samples that where obtained from total tissue and CD45+ cells, which have a similar cell type composition.
sce
class: SingleCellExperiment
dim: 33694 38081
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(38081): FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369-AAACCTGAGCCTATGT ... Human_colon_16S7985397-TTTGCGCAGAGCCCAA
Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
Make Milo object
milo <- Milo(sce)
milo
class: Milo
dim: 33694 38081
metadata(0):
assays(2): counts logcounts
rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
rowData names(0):
colnames(38081): FCAImmP7179369-AAACCTGAGCCCAATT FCAImmP7179369-AAACCTGAGCCTATGT ... Human_colon_16S7985397-TTTGCGCAGAGCCCAA
Human_colon_16S7985397-TTTGGTTAGTACGCGA
colData names(14): Sample donor ... cell types umap_density_Age
reducedDimNames(1): MOFA
altExpNames(0):
neighbourhoods dimensions(1): 0
neighbourhoodCounts dimensions(2): 1 1
neighbourDistances dimensions(2): 1 1
graph names(0):
neighbourhoodIndex names(1): 0
object.size(milo)
13060216 bytes
For now I use scran function instead of buildGraph from package because it’s very slow
## Rename MOFA dim reduction as PCA so buildGraph can find it
reducedDim(milo, "PCA") <- reducedDim(milo)
#
# library(BiocNeighbors)
# library(BiocParallel)
# milo <- buildGraph(milo, k = 30)
knn_graph <- buildKNNGraph(reducedDim(milo, "MOFA"), k=20, d=NA, transposed=TRUE)
miloR::graph(milo) <- knn_graph
Run umap
umap_th <- uwot::umap(reducedDim(milo, "MOFA"), n_neighbors=20 )
reducedDim(milo, 'UMAP') <- umap_th
scater::plotUMAP(milo, colour_by="Age", point_size=0.5, point_alpha=0.5) +
facet_wrap('colour_by')
small_milo <- milo[,which(milo$Age %in% c('7w','17w'))]
knn_graph <- buildKNNGraph(reducedDim(small_milo, "MOFA"), k=30, d=NA, transposed=TRUE)
miloR::graph(small_milo) <- knn_graph
Run umap
small_umap_th <- uwot::umap(reducedDim(small_milo, "MOFA"), n_neighbors=30 )
reducedDim(small_milo, 'UMAP') <- small_umap_th
scater::plotUMAP(small_milo, colour_by="Age", point_size=0.5, point_alpha=0.5) +
facet_wrap('colour_by')
Sample neighborhoods with refined sampling scheme
# milo@neighbourhoods <- list()
small_milo <- makeNeighbourhoods(small_milo, prop=0.1, k = 30, d=5, refined = TRUE, reduced_dims = "PCA", seed = 100)
Checking valid object
plotNeighborhoodSizeHist(small_milo, bins=100)
Make model matrix for testing. I use Age as an ordinal variable for testing.
th.meta <- data.frame(colData(small_milo)[,c("Sample","Age")])
th.meta$Age <- ordered(th.meta$Age, levels=c('7w','17w'))
th.meta <-
distinct(th.meta) %>%
rownames_to_column() %>%
select(Sample, Age) %>%
column_to_rownames("Sample")
th.meta %>%
# filter(Age=="16w")
ggplot(aes(Age)) + geom_bar()
th.model <- model.matrix(~ Age, data=th.meta)
th.model
(Intercept) Age.L
F29_TH_45P 1 0.7071068
F29_TH_45P_5GEX 1 0.7071068
C40_TH_TOT_1 1 -0.7071068
C40_TH_TOT_2 1 -0.7071068
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$Age
[1] "contr.poly"
small_milo <- countCells(small_milo,
data = data.frame(colData(small_milo)[,c("Sample","Age")]),
samples = "Sample")
Checking data validity
Setting up matrix with 765 neighbourhoods
Counting cells in neighbourhoods
graph_spatialFDR <- function(neighborhoods, graph, pvalues, connectivity='vertex', pca=NULL){
# input a set of neighborhoods as a list of graph vertices
# the input graph and the unadjusted GLM p-values
#' neighborhoods: list of vertices and their respective neighborhoods
#' graph: input kNN graph
#' pvalues: a vector of pvalues in the same order as the neighborhood indices
#' connectivity: character - edge or vertex to calculate neighborhood connectivity or distance to use average Euclidean distance
#' pca: matrix of PCs to calculate Euclidean distances, only required when connectivity == distance
# Discarding NA pvalues.
haspval <- !is.na(pvalues)
if (!all(haspval)) {
coords <- coords[haspval, , drop=FALSE]
pvalues <- pvalues[haspval]
}
# define the subgraph for each neighborhood then calculate the vertex connectivity for each
# this latter computation is quite slow - can it be sped up?
subgraphs <- lapply(1:length(neighborhoods[haspval]),
FUN=function(X) induced_subgraph(graph, neighborhoods[haspval][[X]]))
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
if(connectivity == "vertex"){
t.connect <- lapply(subgraphs, FUN=function(EG) vertex_connectivity(EG))
} else if(connectivity == "edge"){
t.connect <- lapply(subgraphs, FUN=function(EG) edge_connectivity(EG))
} else if(connectivity == "distance"){
if(!is.null(pca)){
t.connect <- lapply(1:length(neighborhoods[haspval]),
FUN=function(PG) {
x.pcs <- pca[neighborhoods[haspval][[PG]], ]
x.euclid <- as.matrix(dist(x.pcs))
x.distdens <- 1/mean(x.euclid[lower.tri(x.euclid, diag=FALSE)])
return(x.distdens)})
} else{
stop("A matrix of PCs is required to calculate distances")
}
}else{
stop("connectivity option not recognised - must be either edge, vertex or distance")
}
# use 1/connectivity as the weighting for the weighted BH adjustment from Cydar
w <- 1/unlist(t.connect)
w[is.infinite(w)] <- 0
# Computing a density-weighted q-value.
o <- order(pvalues)
pvalues <- pvalues[o]
w <- w[o]
adjp <- numeric(length(o))
adjp[o] <- rev(cummin(rev(sum(w)*pvalues/cumsum(w))))
adjp <- pmin(adjp, 1)
if (!all(haspval)) {
refp <- rep(NA_real_, length(haspval))
refp[haspval] <- adjp
adjp <- refp
}
return(adjp)
}
# testQLF <- function(graph, nh_counts, th.model, connectivity='edge', pca=NULL){
nh_counts <- small_milo@neighbourhoodCounts
dge <- DGEList(nh_counts[, rownames(th.model)], lib.size=log(colSums(nh_counts)))
dge <- estimateDisp(dge, th.model)
fit <- glmQLFit(dge, th.model, robust=TRUE)
# sim2.contrast <- makeContrasts(ConditionA - ConditionB, levels=th.model)
# sim2.res <- glmQLFTest(sim2.fit, contrast=sim2.contrast)
milo_res <- as.data.frame(topTags(glmQLFTest(fit, coef=1), sort.by='none', n=Inf))
milo_res$Sig <- as.factor(as.numeric(milo_res$FDR <= 0.05))
milo_res$Neighbourhood <- as.numeric(rownames(milo_res))
sim2.spatialfdr <- graph_spatialFDR(neighborhoods=small_milo@neighbourhoods,
graph=small_milo@graph[["graph"]],
connectivity="distance",
pvalues=milo_res$PValue,
pca=reducedDim(milo,"MOFA")
)
milo_res_df <- data.frame(Vertex=names(small_milo@neighbourhoods),
p=milo_res$PValue,
adjp=sim2.spatialfdr,
logFC=milo_res$logFC,
adjp_fdr=milo_res$FDR,
Sig=milo_res$Sig
)
milo_res_df %>%
mutate(is_sig=ifelse(adjp < 0.1, TRUE, FALSE)) %>%
ggplot(aes(logFC, -log10(adjp), color=is_sig)) +
geom_point(size=0.1)
# colData(small_milo) <- colData(sce[,which(sce$Age %in% c('7w','17w'))])
colData(small_milo)["Vertex"] <- as.character(V(graph(small_milo)))
coldata_df <-
SingleCellExperiment::colData(small_milo) %>%
as.data.frame() %>%
rownames_to_column() %>%
left_join(milo_res_df)
Joining, by = "Vertex"
coldata_df
# colData(small_milo)[which(small_milo$Vertex=="850"),]
# milo_res_df %>%
# filter(logFC > 0) %>% pull(Vertex)
# colnames(milo_res_df) %in% colnames(colData(small_milo))
coldata_df <- bind_cols(coldata_df, data.frame(reducedDim(small_milo, 'UMAP')))
coldata_df %>%
ggplot(aes(X1, X2)) +
geom_point(aes(color=Age), size=0.5)
knn_graph <- buildKNNGraph(reducedDim(milo, "MOFA"), k=30, d=NA, transposed=TRUE)
miloR::graph(milo) <- knn_graph
Run umap
umap_th <- uwot::umap(reducedDim(milo, "MOFA"), n_neighbors=30, verbose=TRUE)
reducedDim(milo, 'UMAP') <- umap_th
Sample neighborhoods with refined sampling scheme
# milo@neighbourhoods <- list()
system.time(milo <- makeNeighbourhoods(milo, prop=0.1, k = 30, d=5, refined = TRUE, reduced_dims = "PCA", seed = 100))
Checking valid object
user system elapsed
56.819 1.309 58.305
Make model matrix for testing. I use Age as an ordinal variable for testing.
th.model
(Intercept) Age.L Age.Q Age.C Age^4 Age^5 Age^6 Age^7 Age^8 Age^9
F21_TH_45P 1 0.38533732 0.17407766 -0.1511417 -0.41137668 -0.50128041 -0.4281744 -0.27517866 -0.13089258 -0.040816431
F22_TH_TOT 1 -0.27524094 -0.08703883 0.3778543 -0.31788198 -0.03580574 0.3892495 -0.50351840 0.37397880 -0.163265726
F23_TH_45P 1 -0.05504819 -0.34815531 0.1295501 0.33658092 -0.21483446 -0.3113996 0.32787245 0.26178516 -0.571430041
F29_TH_45P 1 0.49543369 0.52223297 0.4534252 0.33658092 0.21483446 0.1167748 0.05269379 0.01869894 0.004535159
F29_TH_45P_5GEX 1 0.49543369 0.52223297 0.4534252 0.33658092 0.21483446 0.1167748 0.05269379 0.01869894 0.004535159
F30_TH_45P 1 0.27524094 -0.08703883 -0.3778543 -0.31788198 0.03580574 0.3892495 0.50351840 0.37397880 0.163265726
F30_TH_45P_5GEX 1 0.27524094 -0.08703883 -0.3778543 -0.31788198 0.03580574 0.3892495 0.50351840 0.37397880 0.163265726
F38_TH_45P 1 0.16514456 -0.26111648 -0.3346710 0.05609682 0.39386318 0.2335497 -0.24590434 -0.52357031 -0.380953360
F38_TH_45P_5GEX 1 0.16514456 -0.26111648 -0.3346710 0.05609682 0.39386318 0.2335497 -0.24590434 -0.52357031 -0.380953360
F41_TH_45P 1 0.38533732 0.17407766 -0.1511417 -0.41137668 -0.50128041 -0.4281744 -0.27517866 -0.13089258 -0.040816431
F41_TH_45P_5GEX 1 0.38533732 0.17407766 -0.1511417 -0.41137668 -0.50128041 -0.4281744 -0.27517866 -0.13089258 -0.040816431
F45_TH_45P 1 0.05504819 -0.34815531 -0.1295501 0.33658092 0.21483446 -0.3113996 -0.32787245 0.26178516 0.571430041
F45_TH_45P_5GEX 1 0.05504819 -0.34815531 -0.1295501 0.33658092 0.21483446 -0.3113996 -0.32787245 0.26178516 0.571430041
F64_TH_TOT_5GEX_1 1 -0.05504819 -0.34815531 0.1295501 0.33658092 -0.21483446 -0.3113996 0.32787245 0.26178516 -0.571430041
F64_TH_TOT_5GEX_2 1 -0.05504819 -0.34815531 0.1295501 0.33658092 -0.21483446 -0.3113996 0.32787245 0.26178516 -0.571430041
C34_TH_TOT_5GEX 1 -0.27524094 -0.08703883 0.3778543 -0.31788198 -0.03580574 0.3892495 -0.50351840 0.37397880 -0.163265726
C40_TH_TOT_1 1 -0.49543369 0.52223297 -0.4534252 0.33658092 -0.21483446 0.1167748 -0.05269379 0.01869894 -0.004535159
C40_TH_TOT_2 1 -0.49543369 0.52223297 -0.4534252 0.33658092 -0.21483446 0.1167748 -0.05269379 0.01869894 -0.004535159
C41_TH_TOT_1 1 -0.38533732 0.17407766 0.1511417 -0.41137668 0.50128041 -0.4281744 0.27517866 -0.13089258 0.040816431
C41_TH_TOT_2 1 -0.38533732 0.17407766 0.1511417 -0.41137668 0.50128041 -0.4281744 0.27517866 -0.13089258 0.040816431
F74_TH_TOT_5GEX_1 1 -0.16514456 -0.26111648 0.3346710 0.05609682 -0.39386318 0.2335497 0.24590434 -0.52357031 0.380953360
F74_TH_TOT_5GEX_2 1 -0.16514456 -0.26111648 0.3346710 0.05609682 -0.39386318 0.2335497 0.24590434 -0.52357031 0.380953360
attr(,"assign")
[1] 0 1 1 1 1 1 1 1 1 1
attr(,"contrasts")
attr(,"contrasts")$Age
[1] "contr.poly"
heatmap(as.numeric(neighbourhoodCounts(milo)))
Error in heatmap(as.numeric(neighbourhoodCounts(milo))) :
'x' must be a numeric matrix
nh_counts <- milo@neighbourhoodCounts
dge <- DGEList(nh_counts[, rownames(th.model)], lib.size=log(colSums(nh_counts)))
dge <- estimateDisp(dge, th.model)
fit <- glmQLFit(dge, th.model, robust=TRUE)
# sim2.contrast <- makeContrasts(ConditionA - ConditionB, levels=th.model)
# sim2.res <- glmQLFTest(sim2.fit, contrast=sim2.contrast)
milo_res <- as.data.frame(topTags(glmQLFTest(fit, coef=1), sort.by='none', n=Inf))
milo_res$Sig <- as.factor(as.numeric(milo_res$FDR <= 0.05))
milo_res$Neighbourhood <- as.numeric(rownames(milo_res))
sim2.spatialfdr <- graph_spatialFDR(neighborhoods=milo@neighbourhoods,
graph=milo@graph[["graph"]],
connectivity="distance",
pvalues=milo_res$PValue,
pca=reducedDim(milo,"MOFA")
)
colData(milo) <- colData(sce)
colData(milo)["Vertex"] <- as.character(V(graph(milo)))
coldata_df <-
SingleCellExperiment::colData(milo) %>%
as.data.frame() %>%
rownames_to_column() %>%
left_join(milo_res_df)
Joining, by = "Vertex"
coldata_df
# colData(small_milo)[which(small_milo$Vertex=="850"),]
# milo_res_df %>%
# filter(logFC > 0) %>% pull(Vertex)
colnames(milo_res_df) %in% colnames(colData(milo))
[1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE
coldata_df <- bind_cols(coldata_df, data.frame(reducedDim(milo, 'UMAP')))
coldata_df %>%
arrange(- logFC) %>%
ggplot(aes(X1,X2)) +
geom_point(size=0.2, color="grey", alpha=0.5) +
geom_point(data=. %>% filter(!is.na(logFC)), aes(color=logFC), size=1) +
# scale_color_gradient2(mid=0, high = "red", low="blue") +
scale_color_viridis_c(option="magma") +
theme_dimred()
coldata_df %>%
arrange(- log10(adjp)) %>%
ggplot(aes(X1,X2)) +
geom_point(size=0.2, color="grey", alpha=0.5) +
geom_point(data=. %>% filter(!is.na(adjp)), aes(color=-log10(adjp))) +
scale_color_viridis_c() +
theme_dimred()
coldata_df %>%
arrange(nh_size) %>%
ggplot(aes(X1,X2)) +
geom_point(size=0.2, color="grey", alpha=0.5) +
geom_point(data=. %>% filter(!is.na(adjp)), aes(color=nh_size)) +
scale_color_viridis_c() +
theme_dimred()
NA
NA
coldata_df %>%
arrange(- logFC) %>%
ggplot(aes(X1,X2)) +
geom_point(size=0.2, color="grey", alpha=0.5) +
geom_point(data=. %>% filter(!is.na(logFC)), aes(color=logFC), size=1) +
# scale_color_gradient2(mid=0, high = "red", low="blue") +
scale_color_viridis_c(option="magma", direction=-1) +
facet_wrap(cell.types~.) +
theme_dimred()
coldata_df %>%
ggplot(aes(cell.types, fill=Age)) + geom_bar(position="fill") +
scale_fill_viridis_d() +
coord_flip()
coldata_df %>%
group_by(Age) %>%
mutate(n_ct=n()) %>%
ungroup() %>%
group_by(cell.types, Age) %>%
summarise(frac_ct=n()/n_ct) %>%
ggplot(aes(Age, frac_ct, color=cell.types)) +
geom_point() +
geom_line(aes(group=cell.types)) +
facet_wrap(cell.types~.)
`summarise()` regrouping output by 'cell.types', 'Age' (override with `.groups` argument)
th.meta <- data.frame(colData(sce)[,c("Sample","Age")])
keep.ages <- c('11w','12w','13w','14w','16w','17w')
th.meta$Age <- ordered(th.meta$Age, levels=c('7w','8w','9w','10w','11w','12w','13w','14w','16w','17w'))
th.meta <-
distinct(th.meta) %>%
filter(Age %in% keep.ages) %>%
mutate(Age = ordered(Age, levels=keep.ages)) %>%
rownames_to_column() %>%
select(Sample, Age) %>%
column_to_rownames("Sample")
th.meta %>%
ggplot(aes(Age)) + geom_bar()
th.model <- model.matrix(~ Age, data=th.meta)
th.model
# milo_filt <- milo[,which(milo$Age %in% keep.ages)]
milo <- countCells(milo,
data = data.frame(colData(milo)[,c("Sample","Age")]),
samples = "Sample")
milo@neighbourhoodCounts
graph_spatialFDR <- function(neighborhoods, graph, pvalues, connectivity='vertex', pca=NULL){
# input a set of neighborhoods as a list of graph vertices
# the input graph and the unadjusted GLM p-values
#' neighborhoods: list of vertices and their respective neighborhoods
#' graph: input kNN graph
#' pvalues: a vector of pvalues in the same order as the neighborhood indices
#' connectivity: character - edge or vertex to calculate neighborhood connectivity or distance to use average Euclidean distance
#' pca: matrix of PCs to calculate Euclidean distances, only required when connectivity == distance
# Discarding NA pvalues.
haspval <- !is.na(pvalues)
if (!all(haspval)) {
coords <- coords[haspval, , drop=FALSE]
pvalues <- pvalues[haspval]
}
# define the subgraph for each neighborhood then calculate the vertex connectivity for each
# this latter computation is quite slow - can it be sped up?
subgraphs <- lapply(1:length(neighborhoods[haspval]),
FUN=function(X) induced_subgraph(graph, neighborhoods[haspval][[X]]))
# now loop over these sub-graphs to calculate the connectivity - this seems a little slow...
if(connectivity == "vertex"){
t.connect <- lapply(subgraphs, FUN=function(EG) vertex_connectivity(EG))
} else if(connectivity == "edge"){
t.connect <- lapply(subgraphs, FUN=function(EG) edge_connectivity(EG))
} else if(connectivity == "distance"){
if(!is.null(pca)){
t.connect <- lapply(1:length(neighborhoods[haspval]),
FUN=function(PG) {
x.pcs <- pca[neighborhoods[haspval][[PG]], ]
x.euclid <- as.matrix(dist(x.pcs))
x.distdens <- 1/mean(x.euclid[lower.tri(x.euclid, diag=FALSE)])
return(x.distdens)})
} else{
stop("A matrix of PCs is required to calculate distances")
}
}else{
stop("connectivity option not recognised - must be either edge, vertex or distance")
}
# use 1/connectivity as the weighting for the weighted BH adjustment from Cydar
w <- 1/unlist(t.connect)
w[is.infinite(w)] <- 0
# Computing a density-weighted q-value.
o <- order(pvalues)
pvalues <- pvalues[o]
w <- w[o]
adjp <- numeric(length(o))
adjp[o] <- rev(cummin(rev(sum(w)*pvalues/cumsum(w))))
adjp <- pmin(adjp, 1)
if (!all(haspval)) {
refp <- rep(NA_real_, length(haspval))
refp[haspval] <- adjp
adjp <- refp
}
return(adjp)
}
# testQLF <- function(graph, nh_counts, th.model, connectivity='edge', pca=NULL){
nh_counts <- milo@neighbourhoodCounts
dge <- DGEList(nh_counts[, rownames(th.model)], lib.size=log(colSums(nh_counts)))
dge <- estimateDisp(dge, th.model)
fit <- glmQLFit(dge, th.model, robust=TRUE)
# sim2.contrast <- makeContrasts(ConditionA - ConditionB, levels=th.model)
# sim2.res <- glmQLFTest(sim2.fit, contrast=sim2.contrast)
milo_res <- as.data.frame(topTags(glmQLFTest(fit, coef=1), sort.by='none', n=Inf))
milo_res$Sig <- as.factor(as.numeric(milo_res$FDR <= 0.05))
milo_res$Neighbourhood <- as.numeric(rownames(milo_res))
sim2.spatialfdr <- graph_spatialFDR(neighborhoods=milo@neighbourhoods,
graph=milo@graph[["graph"]],
connectivity="distance",
pvalues=milo_res$PValue,
pca=reducedDim(milo,"MOFA")
)
test_mean_nh_size <- function(m, prop, k, n_cells, d=30){
m <- m[,sample(colnames(m), size = n_cells)]
m <- buildGraph(m, k = k, d = d)
refined_nh <- neighbourhoods(makeNeighbourhoods(m, prop=prop, k=k, d=d, refined = TRUE, seed=42))
return(mean(sapply(refined_nh, length)))
}
k_vec <- seq(10,50, by=5)
ncells_vec <- round(ncol(milo)*seq(0.1,1, by=0.1),0)
grid_df <- expand.grid(ncells_vec, k_vec)
colnames(grid_df) <- c("n_cells", "k")
mean_nh_sizes_th <- apply(grid_df, 1, function(x) test_mean_nh_size(milo, prop = 0.2, x["k"], x["n_cells"], d=5))
test_mean_nh_size(milo, prop = 0.2, grid_df[1,"k"], grid_df[1,"n_cells"], d=5)
Visualize results in embedding
We adopt the refined sampling strategy applied in Wishbone, and adapted from here. Briefly, to avoid selecting outliers with random sampling, I first randomly select \(n\) cells. For each sampled cell I then identify its k neares neighbors and compute the median profile of the neighbors (in this case the profile in reduced PC space). Then I replace each sampled cell by the cell closest to the median profile of its neighbors.
sim_milo_ref <- makeNeighbourhoods(sim_milo,prop = 0.1, k=20, d=30, refined = TRUE)
Checking valid object
With the refined sampling scheme I select cells with a larger neighbourhood on average.
When \(n\) is large I often end up sampling less than \(n\) cells because for many randomly sampled cells the cell closest to the KNNs is the same.
data.frame(sample_proportion=lapply(seq(0.01,0.5, by = 0.05), function(x) rep(x, 3)) %>% purrr::reduce(c), random = random_n, refined = refined_n) %>%
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
10: In readChar(file, size, TRUE) : truncating string with embedded nuls
pivot_longer(cols = - sample_proportion, names_to = 'sampling', values_to = "n") %>%
ggplot(aes(sample_proportion, n, color=sampling)) +
geom_point(size=1) +
theme_bw(base_size = 16)
res_ref <- testNeighbourhoods(sim_milo_ref, ~ 1 + condition, data = design_df, fdr.weighting = "k-distance")
Performing spatial FDR correction
Refined sampling seems to be able to identify DA at both ends of the spectrum better
plotMiloReducedDim(sim_milo, res_rand, pt_size=2) + ggtitle("Random sampling")
plotMiloReducedDim(sim_milo_ref, res_ref, pt_size=2) + ggtitle("Refined sampling")
As expected multiple testing correction is less severe with the refined sample set (less points)
res_rand %>%
ggplot(aes(-log10(PValue), -log10(SpatialFDR))) +
geom_abline(linetype=2) +
geom_point() +
ggtitle("Refined sampling")
res_ref %>%
ggplot(aes(-log10(PValue), -log10(SpatialFDR))) +
geom_abline(linetype=2) +
ggtitle("Random sampling") +
geom_point()
I want to select these parameters to increase mean nh size
grid_df %>%
mutate(mean_nh_size=mean_nh_sizes) %>%
group_by(n_cells, k) %>%
summarise(mean_nh_size=mean(mean_nh_size)) %>%
ggplot(aes(k, n_cells)) +
# geom_point() +
# geom_line(aes(group=n_cells)) +
geom_tile(aes(fill=mean_nh_size)) +
scale_fill_viridis_c()
`summarise()` regrouping output by 'n_cells' (override with `.groups` argument)
Try using real data (thymus dataset)
dim(reducedDim(milo, "PCA") )
[1] 38081 5
mean_nh_sizes_th <- apply(grid_df, 1, function(x) test_mean_nh_size(milo, prop = 0.2, x["k"], x["n_cells"], d=5))
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:10
Retrieving distances from 10 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
Constructing kNN graph with k:15
Retrieving distances from 15 nearest neighbours
Checking valid object
I want to check whether using refined sampling allows to have more logFC even with different sampling
mean_nh_sizes
[1] 28.64286 33.20732 35.25000 38.78472 38.53125 49.94444 62.41176 64.72549 74.22481 74.93976 63.75862 83.53968 90.72917
[14] 100.84348 106.32168 75.80000 100.18644 115.23077 124.29630 131.64085 84.44000 116.83051 136.33333 149.01010 160.13846 27.04762
[27] 30.27891 34.40984 35.60887 35.00300 48.63158 57.05983 64.41975 65.90868 68.34657 65.31111 78.37963 89.54015 93.79104
[40] 99.00800 76.78049 96.50000 112.80488 120.74011 125.87391 84.68293 112.50000 131.10000 143.05357 150.30097 26.48810 31.16949
[53] 31.85039 32.59831 33.01573 46.52055 55.45946 60.25123 61.62791 64.67363 60.72414 75.69173 85.32955 88.09506 92.12575
[66] 71.59259 94.60484 108.88387 112.68333 119.77517 80.38636 111.71287 128.65248 136.46635 144.13910 25.00980 28.42035 31.47619
[79] 31.28369 32.08755 45.50562 52.29775 58.47490 59.66289 62.16317 60.88889 74.37931 84.51402 85.57413 90.45431 72.01471
[92] 92.57812 107.51020 110.75746 115.67049 80.37288 106.41739 126.79651 133.06276 142.97603 26.21552 28.05512 31.16071 30.49407
[105] 31.15677 44.42105 52.66497 58.34082 56.54884 60.65984 59.76471 74.73171 83.37885 83.07163 87.92148 69.76000 92.60000
[118] 105.17647 108.30323 112.15365 79.95161 110.25210 126.00546 131.74170 138.67422
run_milo_sampling <- function(graph, meta.df, model, X_pca, seed=42, sample.vertices=0.1){
set.seed(seed)
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
vertex.list.refined <- sapply(1:length(refined.vertices), FUN=function(X) neighbors(graph, v=refined.vertices[X]))
count.matrix.random <- countCells(sim2.knn, meta.df, vertex.list = vertex.list, random.vertices = random.vertices, sample.column = "sample")
count.matrix.refined <- countCells(sim2.knn, meta.df, vertex.list = vertex.list.refined, random.vertices = refined.vertices, sample.column = "sample")
spFDR.random <- testQLF(graph, count.matrix.random, model)
spFDR.refined <- testQLF(graph, count.matrix.refined, model)
fdr.df.random <- data.frame(Vertex=as.integer(rownames(spFDR.random$res)), p=spFDR.random$res$PValue, adjp=spFDR.random$spFDR, adjp_fdr=spFDR.random$res$FDR, logFC=spFDR.random$res$logFC, Sig=spFDR.random$res$Sig)
fdr.df.refined <- data.frame(Vertex=as.integer(rownames(spFDR.refined$res)), p=spFDR.refined$res$PValue, adjp=spFDR.refined$spFDR, logFC=spFDR.refined$res$logFC, adjp_fdr=spFDR.refined$res$FDR, Sig=spFDR.refined$res$Sig)
return(list(random=fdr.df.random, refined=fdr.df.refined))
}
sample_perc5 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.05))
sample_perc10 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.1))
sample_perc15 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.15))
sample_perc20 <- map(2020:2025, ~ run_milo_sampling(data_5k_cells$graph, data_5k_cells$meta.df, data_5k_cells$model, data_5k_cells$X_pca, seed=.x, sample.vertices = 0.2))
make_test_df <- function(sample_df){
sample_df %>%
imap( ~ bind_rows(.x[["refined"]] %>% dplyr::mutate(sampling="refined"),
.x[["random"]] %>% dplyr::mutate(sampling="random")) %>%
dplyr::mutate(s=.y)) %>%
purrr::reduce(bind_rows) %>%
left_join(data_5k_cells$meta.df) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>%
group_by(sampling, s, group_id) %>%
summarise(mean_logFC=mean(logFC))
}
map(list(perc5=sample_perc5, perc10=sample_perc10, perc15=sample_perc15, perc20=sample_perc20), ~ make_test_df(.x)) %>%
imap( ~ dplyr::mutate(.x, perc=.y)) %>%
purrr::reduce(bind_rows) %>%
ggplot(aes(group_id, mean_logFC, color=perc)) +
# geom_pointrange(stat = "summary",
# fun.min = min,
# fun.max = max,
# fun = mean) +
geom_boxplot(varwidth = TRUE) +
facet_grid(.~sampling) +
scale_fill_gradient2()
No big differences TBH
Do I get high/low FC where unexpected just because things are changing elsewhere?
data_2k_cells <- simulate_linear_traj(num_cells = 2000, num_milestones = 10, prob_start = 0.5, prob_end=0.95)
ggplot(data_2k_cells$meta.df, aes(UMAP1, UMAP2, color=condition)) + geom_point(size=0.2) +
theme_clean()
ggplot(data_2k_cells$meta.df, aes(UMAP1, UMAP2, color=group_id)) + geom_point(size=0.2) +
theme_clean() +
geom_text(data = . %>% group_by(group_id) %>% summarise(UMAP1=first(UMAP1), UMAP2=first(UMAP2)), aes(label=group_id), color="black")
graph <- data_2k_cells$graph
sample.vertices <- 0.1
meta.df <- data_2k_cells$meta.df
model <- data_2k_cells$model
X_pca <- data_2k_cells$X_pca
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
vertex.list <- sapply(1:length(random.vertices), FUN=function(X) neighbors(graph, v=random.vertices[X]))
vertex.list.refined <- sapply(1:length(refined.vertices), FUN=function(X) neighbors(graph, v=refined.vertices[X]))
count.matrix.random <- countCells(graph, meta.df, vertex.list = vertex.list, random.vertices = random.vertices, sample.column = "sample")
count.matrix.refined <- countCells(graph, meta.df, vertex.list = vertex.list.refined, random.vertices = refined.vertices, sample.column = "sample")
spFDR.random <- testQLF(graph, count.matrix.random, model)
spFDR.refined <- testQLF(graph, count.matrix.refined, model)
Refined sampling seems to be able to identify DA at both ends of the spectrum better
fdr.df.random <- data.frame(Vertex=as.integer(rownames(spFDR.random$res)), p=spFDR.random$res$PValue, adjp=spFDR.random$spFDR, adjp_fdr=spFDR.random$res$FDR, logFC=spFDR.random$res$logFC, Sig=spFDR.random$res$Sig)
fdr.df.refined <- data.frame(Vertex=as.integer(rownames(spFDR.refined$res)), p=spFDR.refined$res$PValue, adjp=spFDR.refined$spFDR, logFC=spFDR.refined$res$logFC, adjp_fdr=spFDR.refined$res$FDR, Sig=spFDR.refined$res$Sig)
meta.df %>%
left_join(fdr.df.random) %>%
# dplyr::arrange(sampled) %>%
ggplot(aes(UMAP1, UMAP2,
# color= - log10(adjp),
# color= - log10(p),
color = logFC
)) +
geom_point(size=0.5) +
geom_point(data=. %>% dplyr::filter(!is.na(adjp))) +
theme_clean() +
scale_color_gradient2(midpoint = 0, high = "red", low="blue",na.value ="grey80") +
ggtitle("Random sampling")
meta.df %>%
left_join(fdr.df.refined) %>%
# dplyr::arrange(sampled) %>%
ggplot(aes(UMAP1, UMAP2,
# color= - log10(adjp),
# color= - log10(p),
color = logFC
)) +
geom_point(size=0.5) +
geom_point(data=. %>% dplyr::filter(!is.na(adjp))) +
theme_clean() +
scale_color_gradient2(midpoint = 0, high = "red", low="blue",na.value ="grey80") +
ggtitle("Refined sampling")
meta.df %>%
left_join(fdr.df.refined) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(logFC, -log10(adjp), shape=Sig, color=group_id)) +
geom_point() +
ggtitle("refined sampling") +
meta.df %>%
left_join(fdr.df.random) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(logFC, -log10(adjp), shape=Sig, color=group_id)) +
geom_point() +
ggtitle("random sampling")
num_milestones=10
meta.df %>%
ungroup() %>%
left_join(fdr.df.refined) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>%
dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(group_id, logFC, shape=Sig, color=group_id, size= -log10(adjp))) +
geom_jitter() +
ggtitle("refined sampling") +
meta.df %>%
ungroup() %>%
left_join(fdr.df.random) %>%
dplyr::mutate(group_id = factor(group_id, levels=paste0('M', 1:num_milestones))) %>% dplyr::filter(!is.na(logFC)) %>%
ggplot(aes(group_id, logFC, shape=Sig, color=group_id, size= -log10(adjp))) +
geom_jitter() +
ggtitle("random sampling")
simulate_linear_traj <- function(num_cells, num_milestones, num_features=1000, k_param=21, seed=42,
prob_start=0.1, prob_end=0.9){
set.seed(seed)
## Generate simulated dataset of trajectory
dataset <- generate_dataset(
model = model_linear(num_milestones = num_milestones),
num_cells = num_cells,
num_features = num_features
)
sim2.gex <- as.matrix(dataset$expression)
sim2.branches <- dataset$prior_information$groups_id
sim2.time = dataset$prior_information$timecourse_continuous
## Build graph
sim2.pca <- prcomp_irlba(sim2.gex, n=50, scale.=TRUE, center=TRUE)
X_pca = sim2.pca$x[, c(1:30)]
sim2.knn <- buildKNNGraph(x=X_pca, k=k_param, d=NA, transposed=TRUE)
## Run UMAP
stem.ta.umap <- umap(sim2.pca$x[, c(1:30)],
n_components=2,
n_neighbors=k_param, metric='euclidean',
init='random', min_dist=0.1)
dyn.df <- data.frame(UMAP1=stem.ta.umap$layout[,1], UMAP2=stem.ta.umap$layout[,2],
cell_id=rownames(sim2.gex), time=sim2.time)
dyn.df <- dyn.df %>% left_join(sim2.branches)
## Simulate conditions
n_groups <- length(unique(dyn.df$group_id))
p_vec <- seq(prob_start, prob_end, length.out = n_groups)
a.cells <- c()
for (i in 1:n_groups) {
g <- paste0("M",i)
p <- p_vec[i]
m.A <- sample(dyn.df$cell_id[dyn.df$group_id==g],
size=floor(sum(dyn.df$group_id==g)*p))
a.cells <- c(a.cells, m.A)
}
dyn.df <- dyn.df %>% dplyr::mutate(condition = ifelse(cell_id %in% a.cells, "A", 'B'))
## Simulate replicates
dyn.df <- dyn.df %>%
group_by(group_id) %>%
dplyr::mutate(replicate=c(rep("R1", floor(n()*0.3)),
rep("R2", floor(n()*0.3)),
rep("R3", n() - 2*(floor(n()*0.3))))
)
## Add sample name (condition + replicate)
dyn.df$sample <- paste(dyn.df$condition, dyn.df$replicate, sep="_")
## Add vertex id (for counts)
dyn.df$Vertex <- as.vector(V(sim2.knn))
## Make model matrix for testing
sample.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
"Replicate"=rep(c("R1", "R2", "R3"), 2))
sample.meta$Sample <- paste(sample.meta$Condition, sample.meta$Replicate, sep="_")
rownames(sample.meta) <- sample.meta$Sample
sim2.model <- model.matrix(~ 0 + Condition, data=sample.meta)
return(list(graph=sim2.knn,
X_pca=X_pca,
meta.df=dyn.df,
model=sim2.model))
}
data_2k_cells <- simulate_linear_traj(num_cells = 2000, num_milestones = 10, prob_start = 0.5, prob_end=0.95)
graph <- data_2k_cells$graph
sample.vertices <- 0.1
meta.df <- data_2k_cells$meta.df
model <- data_2k_cells$model
X_pca <- data_2k_cells$X_pca
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
data.frame(random=as.numeric(random.vertices), refined=as.numeric(refined.vertices)) %>%
rowid_to_column() %>%
group_by(as.factor(refined)) %>%
dplyr::mutate(n_converging = n()) %>%
ungroup() %>%
pivot_longer(cols=c('random', "refined"), names_to = "sampling_scheme", values_to = "Vertex") %>%
left_join(meta.df, by="Vertex") %>%
ggplot(aes(time,sampling_scheme, color=n_converging)) +
geom_point(size=0.5) +
geom_line(aes(group=rowid), size=0.5) +
scale_color_viridis_c()
data.frame(random=as.numeric(random.vertices), refined=as.numeric(refined.vertices)) %>%
rowid_to_column() %>%
group_by(as.factor(refined)) %>%
dplyr::mutate(n_converging = n()) %>%
ggplot(aes(as.factor(n_converging))) + geom_histogram(stat="count")
Distances should become more uniform
get_dist_to_closest_neigh <- function(graph, sample.vertices){
random.vertices <- sample(V(graph), size=floor(sample.vertices*length(V(graph))))
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=21, subset=as.vector(random.vertices))
refined.vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
dist_to_closest_random <- BiocNeighbors::findKNN(X=X_pca[as.vector(random.vertices),], k=1)[["distance"]]
dist_to_closest_refined <- BiocNeighbors::findKNN(X=X_pca[unique(as.vector(refined.vertices)),], k=1)[["distance"]]
dist_df <- bind_rows(data.frame(distance_to_closest=dist_to_closest_refined, sampling_scheme='refined'),
data.frame(distance_to_closest=dist_to_closest_random, sampling_scheme='random')) %>%
dplyr::mutate(sample_perc=sample.vertices)
}
dist_ls <- map(seq(0.1,0.6, by = 0.05), ~ get_dist_to_closest_neigh(graph, .x))
purrr::reduce(dist_ls, bind_rows) %>%
ggplot(aes(as.factor(sample_perc), distance_to_closest, color=sampling_scheme)) +
# ggbeeswarm::geom_quasirandom()
geom_boxplot(varwidth = TRUE) +
xlab("% sampled") + ylab("Distance to closest sample") +
theme_grey(base_size = 14)
data_5k_cells <- simulate_linear_traj(num_cells = 5000, num_milestones = 10)
meta.df <- data_5k_cells$meta.df
model <- data_5k_cells$model
X_pca <- data_5k_cells$X_pca
k_vec <- seq(10,50, by=5)
graph_ls <- map(k_vec, ~ buildKNNGraph(x=X_pca, k=.x, d=NA, transposed=TRUE))
get_neigh_df <- function(sampled_vertices, graph, X_pca, k_param, sampling_mode="random"){
if (sampling_mode=="refined") {
vertex.knn <- BiocNeighbors::findKNN(X=X_pca, k=k_param, subset=as.vector(sampled_vertices))
sampled_vertices <- V(graph)[sapply(1:nrow(vertex.knn$index), function(i) refine_vertex(vertex.knn, i, X_pca))]
}
sampled_vertices <- unique(sampled_vertices)
vertex.list <- sapply(1:length(sampled_vertices), FUN=function(X) neighbors(graph, v=sampled_vertices[X]))
neigh_df <- data.frame(neigh_vertex=as.vector(sampled_vertices), neigh_size=sapply(vertex.list, function(x) length(x)),
sampling_mode=sampling_mode, k=k_param)
return(neigh_df)
}
neigh_df_ls <- lapply(seq_along(k_vec), function(i){
random_sample <- sample(V(graph_ls[[i]]), size=floor(sample.vertices*length(V(graph_ls[[i]]))))
sampled_vertices <- random_sample
random_neigh_df <- get_neigh_df(random_sample, graph_ls[[i]], X_pca, k_vec[i], sampling_mode="random")
refined_neigh_df <- get_neigh_df(random_sample, graph_ls[[i]], X_pca, k_vec[i], sampling_mode="refined")
bind_rows(random_neigh_df, refined_neigh_df)
})
purrr::reduce(neigh_df_ls, bind_rows) %>%
ggplot(aes(as.factor(k), neigh_size, color=sampling_mode)) +
geom_violin(scale = "width") +
geom_boxplot(width=0.2) +
facet_wrap(sampling_mode~.) +
xlab("K") + ylab("Neighborhood size") +
theme_clean(base_size = 18)
purrr::reduce(neigh_df_ls, bind_rows) %>%
ggplot(aes(as.factor(k), neigh_size / k, color=sampling_mode)) +
geom_violin(scale = "width") +
geom_boxplot(width=0.2) +
facet_wrap(sampling_mode~.) +
xlab("K") + ylab("Neighborhood size / K") +
theme_clean(base_size = 18)
purrr::reduce(neigh_df_ls, bind_rows) %>%
group_by(sampling_mode, k) %>%
summarise(n=n()) %>%
ggplot(aes(k,n, color=sampling_mode)) +
geom_point()